Spaces:
Sleeping
Sleeping
Upload 313 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +2 -0
- hf_demo/README.md +7 -0
- hf_demo/db.sqlite3 +0 -0
- hf_demo/einstein/__init__.py +0 -0
- hf_demo/einstein/__pycache__/__init__.cpython-38.pyc +0 -0
- hf_demo/einstein/__pycache__/admin.cpython-38.pyc +0 -0
- hf_demo/einstein/__pycache__/apps.cpython-38.pyc +0 -0
- hf_demo/einstein/__pycache__/constants.cpython-38.pyc +0 -0
- hf_demo/einstein/__pycache__/ml_service.cpython-38.pyc +0 -0
- hf_demo/einstein/__pycache__/models.cpython-38.pyc +0 -0
- hf_demo/einstein/__pycache__/urls.cpython-38.pyc +0 -0
- hf_demo/einstein/__pycache__/views.cpython-38.pyc +0 -0
- hf_demo/einstein/admin.py +3 -0
- hf_demo/einstein/apps.py +6 -0
- hf_demo/einstein/constants.py +52 -0
- hf_demo/einstein/migrations/__init__.py +0 -0
- hf_demo/einstein/migrations/__pycache__/__init__.cpython-38.pyc +0 -0
- hf_demo/einstein/ml_service.py +388 -0
- hf_demo/einstein/models.py +3 -0
- hf_demo/einstein/tests.py +3 -0
- hf_demo/einstein/urls.py +6 -0
- hf_demo/einstein/views.py +53 -0
- hf_demo/fastText/.circleci/cmake_test.sh +18 -0
- hf_demo/fastText/.circleci/config.yml +196 -0
- hf_demo/fastText/.circleci/gcc_test.sh +25 -0
- hf_demo/fastText/.circleci/pip_test.sh +11 -0
- hf_demo/fastText/.circleci/pull_data.sh +33 -0
- hf_demo/fastText/.circleci/python_test.sh +11 -0
- hf_demo/fastText/.circleci/run_locally.sh +13 -0
- hf_demo/fastText/.circleci/setup_circleimg.sh +11 -0
- hf_demo/fastText/.circleci/setup_debian.sh +11 -0
- hf_demo/fastText/.gitignore +12 -0
- hf_demo/fastText/CMakeLists.txt +80 -0
- hf_demo/fastText/CODE_OF_CONDUCT.md +77 -0
- hf_demo/fastText/CONTRIBUTING.md +32 -0
- hf_demo/fastText/LICENSE +21 -0
- hf_demo/fastText/MANIFEST.in +5 -0
- hf_demo/fastText/Makefile +125 -0
- hf_demo/fastText/PACKAGE +3 -0
- hf_demo/fastText/README.md +339 -0
- hf_demo/fastText/alignment/README.md +67 -0
- hf_demo/fastText/alignment/align.py +145 -0
- hf_demo/fastText/alignment/eval.py +60 -0
- hf_demo/fastText/alignment/example.sh +51 -0
- hf_demo/fastText/alignment/unsup_align.py +109 -0
- hf_demo/fastText/alignment/unsup_multialign.py +198 -0
- hf_demo/fastText/alignment/utils.py +154 -0
- hf_demo/fastText/classification-example.sh +41 -0
- hf_demo/fastText/classification-results.sh +94 -0
- hf_demo/fastText/crawl/README.md +26 -0
.gitattributes
CHANGED
|
@@ -34,3 +34,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
ml_models/sentiment_model/model.ft filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
ml_models/sentiment_model/model.ft filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
hf_demo/fastText/website/static/img/authors/tomas_mikolov.jpg filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
hf_demo/ml_models/sentiment_model/model.ft filter=lfs diff=lfs merge=lfs -text
|
hf_demo/README.md
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
1. pip install -r requirements.txt
|
| 2 |
+
2. Next, run the following commands in the environment:
|
| 3 |
+
- pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1.tar.gz
|
| 4 |
+
- git clone https://github.com/facebookresearch/fastText.git
|
| 5 |
+
- cd fastText
|
| 6 |
+
- pip install .
|
| 7 |
+
3. Add the ml_models directory to the project base directory.
|
hf_demo/db.sqlite3
ADDED
|
File without changes
|
hf_demo/einstein/__init__.py
ADDED
|
File without changes
|
hf_demo/einstein/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (140 Bytes). View file
|
|
|
hf_demo/einstein/__pycache__/admin.cpython-38.pyc
ADDED
|
Binary file (181 Bytes). View file
|
|
|
hf_demo/einstein/__pycache__/apps.cpython-38.pyc
ADDED
|
Binary file (421 Bytes). View file
|
|
|
hf_demo/einstein/__pycache__/constants.cpython-38.pyc
ADDED
|
Binary file (1.4 kB). View file
|
|
|
hf_demo/einstein/__pycache__/ml_service.cpython-38.pyc
ADDED
|
Binary file (14.2 kB). View file
|
|
|
hf_demo/einstein/__pycache__/models.cpython-38.pyc
ADDED
|
Binary file (178 Bytes). View file
|
|
|
hf_demo/einstein/__pycache__/urls.cpython-38.pyc
ADDED
|
Binary file (316 Bytes). View file
|
|
|
hf_demo/einstein/__pycache__/views.cpython-38.pyc
ADDED
|
Binary file (2 kB). View file
|
|
|
hf_demo/einstein/admin.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from django.contrib import admin
|
| 2 |
+
|
| 3 |
+
# Register your models here.
|
hf_demo/einstein/apps.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from django.apps import AppConfig
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class EinsteinConfig(AppConfig):
    """Django application configuration for the ``einstein`` app."""

    # App label/module path that Django uses to locate this application.
    name = 'einstein'
    # Primary-key field type used for models that do not declare one.
    default_auto_field = 'django.db.models.BigAutoField'
|
hf_demo/einstein/constants.py
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Phrases that are matched (case-insensitively, by the caller's PhraseMatcher)
# to short-circuit a span straight to POSITIVE sentiment.
POSITIVE_SENTIMENT_PATTERNS = [
    'thankful', 'grateful', 'terrific', 'sensational', 'marvelous',
    'phenomenal', 'perfect', 'fantastic', 'splendid',
    'first class', 'first-class',
    'brilliant', 'awesome', 'superb', 'amazing',
]
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# Per-category probability cut-off: a span is labelled POSITIVE when the
# sentiment model's positive probability is >= the value for its category.
# The float literals are kept exactly as given; do not round them.
CATEGORY_THRESHOLD = dict(
    AMENITIES=0.47000000000000003,
    CLEANLINESS=0.31,
    COMMUNICATION=0.25,
    CONDITION=0.15000000000000002,
    CUSTOMER_SERVICE=0.35000000000000003,
    EXTERIOR_LIGHTING=0.33,
    FINANCIAL=0.66,
    INTERIOR_LIGHTING=0.54,
    INTERNET=0.02,
    LANDSCAPING_GROUNDS=0.26,
    MAINTENANCE_CLEANLINESS=0.01,
    MAINTENANCE_SERVICE=0.48000000000000004,
    MAINTENANCE_TIMELINESS=0.62,
    MOVE_IN_QUALITY=0.18000000000000002,
    NOISE=0.14,
    PACKAGES_MAIL=0.15000000000000002,
    PARKING=0.27,
    PESTS=0.64,
    PET_WASTE=0.33,
    SECURITY=0.18000000000000002,
    SMOKE=0.06999999999999999,
    TRASH=0.4,
)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# Display colour (hex) for every NER category label. Insertion order is kept
# as-is because consumers iterate this mapping to build the colour legend.
LABEL_COLOR = {
    'COMMUNICATION': '#FF0000',
    'AMENITIES': '#00B050',
    'CLEANLINESS': '#00B0F0',
    'CONDITION': '#9999FF',
    'CUSTOMER_SERVICE': '#00FFFF',
    'FINANCIAL': '#666699',
    'LANDSCAPING_GROUNDS': '#800000',
    'MAINTENANCE_CLEANLINESS': '#7030A0',
    'MAINTENANCE_SERVICE': '#993366',
    'MAINTENANCE_TIMELINESS': '#FF0066',
    'MOVE_IN_QUALITY': '#CC9900',
    'NOISE': '#FFC000',
    'PACKAGES_MAIL': '#CC6600',
    'PARKING': '#FF9966',
    'PESTS': '#FF00FF',
    'PET_WASTE': '#000066',
    'SECURITY': '#0000FF',
    'SMOKE': '#808080',
    'TRASH': '#808000',
    'EXTERIOR_LIGHTING': '#00FFCC',
    'INTERNET': '#33CC33',
    'INTERIOR_LIGHTING': '#008080',
}
|
hf_demo/einstein/migrations/__init__.py
ADDED
|
File without changes
|
hf_demo/einstein/migrations/__pycache__/__init__.cpython-38.pyc
ADDED
|
Binary file (151 Bytes). View file
|
|
|
hf_demo/einstein/ml_service.py
ADDED
|
@@ -0,0 +1,388 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import itertools
|
| 2 |
+
import json
|
| 3 |
+
import re
|
| 4 |
+
import fasttext
|
| 5 |
+
import pandas as pd
|
| 6 |
+
import spacy
|
| 7 |
+
from simpletransformers.ner import NERModel
|
| 8 |
+
from spacy.matcher import PhraseMatcher
|
| 9 |
+
from einstein.constants import POSITIVE_SENTIMENT_PATTERNS, LABEL_COLOR, CATEGORY_THRESHOLD
|
| 10 |
+
from django.conf import settings
|
| 11 |
+
from emoji import demojize
|
| 12 |
+
import unicodedata
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
# Project root as configured in Django settings; all model artefacts live
# under "<BASE_DIR>/ml_models".
base_directory = settings.BASE_DIR

# JSON file with the NER label set.
labels_file = "{}/ml_models/labels.json".format(base_directory)
# Directory holding the NER model (trailing slash kept on purpose).
ner_model_directory = "{}/ml_models/ner_model/".format(base_directory)
# fastText sentiment model binary.
sentiment_model_file = "{}/ml_models/sentiment_model/model.ft".format(base_directory)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class MlProcessing:
    """Run the review pipeline (scoring, cleaning, NER, sentiment) on one comment.

    The comment is a plain dict that is mutated in place as it moves through
    the pipeline. Keys read here: ``text``, ``skip`` and, optionally,
    ``star_rating`` / ``tali_score``. Keys written here: ``score``,
    ``score_str``, ``cleaned``, ``spans`` and (on the returned dict)
    ``color_map``.
    """

    def __init__(self, comment_dict):
        self.comment_dict = comment_dict
        self.is_cleaned = False

    def remove_prefix(self, label):
        """Return the part of *label* after the last '-' (e.g. 'B-NOISE' -> 'NOISE')."""
        return label.split('-')[-1]

    def labels_to_spans(self, tokens, labels):
        """Merge runs of consecutive tokens sharing a (prefix-less) label into spans.

        :param tokens: token dicts carrying ``start`` / ``end`` character offsets
        :param labels: one label per token; 'O' marks unlabelled tokens
        :return: list of ``{'label', 'start', 'end', 'n_tokens'}`` dicts
        """
        spans = []
        grouped = itertools.groupby(zip(tokens, labels), key=lambda pair: self.remove_prefix(pair[1]))
        for label, group in grouped:
            if label == 'O':
                continue

            group_tokens = [token for token, _ in group]
            spans.append({'label': label, 'start': group_tokens[0]['start'], 'end': group_tokens[-1]['end'],
                          'n_tokens': len(group_tokens)})

        return spans

    def score_to_str(self, score):
        """Return 'RATING_<int>' for a score, or '' when the score is missing."""
        if pd.isna(score):
            return ''
        return f'RATING_{int(score)}'

    def configure_matcher(self, nlp, patterns):
        """Build a case-insensitive spaCy PhraseMatcher for *patterns* under the key 'positive'."""
        matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
        matcher.add('positive', [nlp.make_doc(p) for p in patterns])
        return matcher

    def cleaner(self):
        """Clean ``comment_dict['text']`` in place and flag the comment as cleaned."""
        self.comment_dict['text'] = ReviewsCleaner().clean_text(self.comment_dict['text'])
        self.comment_dict['cleaned'] = True
        self.is_cleaned = True

    def clip(self, x, min_, max_):
        """Clamp *x* into the closed interval [min_, max_]."""
        if x < min_:
            return min_
        if x > max_:
            return max_
        return x

    @staticmethod
    def _to_number(value):
        """Convert a rating value to a finite float, or return None if it is not one."""
        if value is None or isinstance(value, bool):
            return None
        try:
            number = float(value)
        except (TypeError, ValueError):
            return None
        # Reject NaN and +/-inf: they cannot be turned into a rating.
        if number != number or abs(number) == float('inf'):
            return None
        return number

    def get_score(self):
        """Derive a 0-5 score from ``star_rating`` or, failing that, ``tali_score``.

        ``tali_score`` is floor-divided by 2 before clipping. Values are parsed
        with ``float()`` so that ratings such as ``4.0`` or ``'4.5'`` are
        accepted; the previous ``str.isnumeric()`` test rejected them (and
        accepted characters such as '½' that ``float()`` cannot parse).

        :return: ``{'score': number-or-None, 'score_str': str}``
        """
        star_rating = self._to_number(self.comment_dict.get('star_rating'))
        tali_score = self._to_number(self.comment_dict.get('tali_score'))

        if star_rating is not None:
            score = self.clip(star_rating, 0, 5)
        elif tali_score is not None:
            score = self.clip(tali_score // 2, 0, 5)
        else:
            score = None

        return {'score': score, 'score_str': self.score_to_str(score)}

    @staticmethod
    def _unlabelled_segment(label):
        """Build an output segment for text that carries no category."""
        return {"label": label, "color": "", "value": "", "sentiment": "", "score": None}

    def reformat_output(self, data):
        """Replace ``data['spans']`` with a display list of labelled and unlabelled segments.

        Each output segment holds the covered text under ``label`` plus
        ``color``, ``value`` (category), ``sentiment`` and ``score``.
        """
        text = data["text"]
        spans = data.get("spans", list())
        new_spans = list()
        previous_span_end = -1
        for i, span in enumerate(spans):
            span_start = span["start"]
            span_end = span["end"]

            # Unlabelled text between the previous span and this one (also
            # covers text before the first span).
            # NOTE(review): the character at `previous_span_end` itself is
            # skipped here - presumably the separating space; behaviour is
            # kept as-is, confirm against the consumer of this output.
            if span_start != previous_span_end + 1:
                new_spans.append(self._unlabelled_segment(text[previous_span_end + 1:span_start]))

            # The labelled span itself.
            new_spans.append({
                "label": text[span_start:span_end],
                "color": LABEL_COLOR[span["label"]],
                "value": span["label"],
                "sentiment": span["sentiment"],
                "score": span["score"]
            })

            previous_span_end = span_end

            # Trailing unlabelled text after the last labelled span.
            if (i == len(spans) - 1) and span_end < len(text):
                new_spans.append(self._unlabelled_segment(text[span_end:]))

        data.update({"spans": new_spans})

    def preprocess_text(self, text):
        """Normalise text for the sentiment model (lower-case, collapse dots/whitespace)."""
        text = text.lower()
        # Turn the 2nd, 3rd, ... dot of a run of dots into a space.
        text = re.sub(r'(?<=\.)\.', ' ', text)
        text = text.strip().strip('. ",')
        text = text.replace('\n', ' ')
        text = text.replace('’', "'")
        text = re.sub(r'\s+', ' ', text)
        return text

    def predict(self, model, text, category):
        """Classify *text* as POSITIVE/NEGATIVE using the threshold of *category*.

        :param model: object whose ``predict(text, k=2)`` returns ``(labels, probs)``
        :param text: raw text; it is preprocessed before prediction
        :param category: key into ``CATEGORY_THRESHOLD``
        :return: ``{'label': 'POSITIVE' | 'NEGATIVE', 'score': positive probability}``
        """
        text = self.preprocess_text(text)
        labels, probs = model.predict(text, k=2)

        if labels[0] == '__label__POSITIVE':
            prob = probs[0]
        elif len(probs) > 1:
            prob = probs[1]
        else:
            # Only one (non-positive) label came back; previously this raised
            # IndexError. Use the complement as the positive probability.
            prob = 1.0 - probs[0]
        prob = float(prob)

        label = 'POSITIVE' if prob >= CATEGORY_THRESHOLD[category] else 'NEGATIVE'
        return {'label': label, 'score': prob}

    def apply_sentiment_model(self, review_dict_entities):
        """Attach ``sentiment`` and ``score`` to every span of the comment.

        A span containing one of the positive phrases is marked POSITIVE with
        score 1.0; any other span is scored by the sentiment model on the text
        from the start of its sentence to the end of the span.

        :param review_dict_entities: unused; kept for interface compatibility
        :return: ``self.comment_dict``
        """
        # Check `skip` first so no models are touched for skipped comments.
        if self.comment_dict['skip']:
            return self.comment_dict

        nlp = settings.LANGUAGE_MODEL
        sentence_finder = SentenceBoundsFinder(nlp)
        positive_sentiment_matcher = self.configure_matcher(nlp, POSITIVE_SENTIMENT_PATTERNS)
        sentiment_model = self.load_sentiment_model()

        # NOTE(review): replacing '_x000D_' with a single space shifts
        # offsets relative to comment_dict['text']; this only lines up when
        # the text contains no '_x000D_' (e.g. after cleaning) - confirm.
        review = re.sub(r'["“”]|_x000D_', ' ', self.comment_dict['text'])
        sentence_bounds = sentence_finder(review)
        for span in self.comment_dict.get('spans', []):
            segment_text = self.comment_dict['text'][span['start']:span['end']].replace('\n', ' ')

            if positive_sentiment_matcher(nlp(segment_text)):
                sentiments = {'label': 'POSITIVE', 'score': 1.}
            else:
                span_start = self.get_sentence_start(sentence_bounds, span['start'])
                text = self.comment_dict['text'][span_start:span['end']].replace('\n', ' ')
                text = f"{self.comment_dict['score_str'].lower()} {span['label'].lower()} {text}"
                sentiments = self.predict(sentiment_model, text, span['label'])

            span['sentiment'] = sentiments.get('label')
            span['score'] = sentiments.get('score')
        return self.comment_dict

    def load_sentiment_model(self):
        """Return the sentiment model configured in Django settings."""
        return settings.SENTIMENT_MODEL

    def get_sentence_start(self, sentence_bounds, position):
        """Return the start offset of the sentence whose bounds contain *position*.

        :raises RuntimeError: if no sentence contains *position*
        """
        for start, end in sentence_bounds:
            if start <= position <= end:
                return start

        raise RuntimeError('Failed to get sentence bound')

    def load_ner_model(self, max_seq_len=500, use_multiprocessing=True):
        """Instantiate the longformer NER model from ``ner_model_directory`` (CPU only)."""
        args = {'overwrite_output_dir': False, 'reprocess_input_data': True, 'num_train_epochs': 30,
                'evaluation_strategy': 'epoch', 'evaluate_during_training': True, 'silent': True,
                'max_seq_length': max_seq_len, 'use_multiprocessing': use_multiprocessing,
                'use_multiprocessing_for_evaluation': use_multiprocessing, 'fp16': True}

        labels = settings.LABELS

        return NERModel('longformer', ner_model_directory, args=args, use_cuda=False, labels=labels)

    def apply_ner_model(self):
        """Tokenise the comment, run the NER model and store the result in ``spans``.

        Text from "(original...)" onwards is stripped first. On any failure
        while applying the model the comment is flagged with ``skip = True``.

        :return: ``self.comment_dict`` (always; the failure path used to return None)
        """
        if self.comment_dict['skip']:
            return self.comment_dict

        nlp = settings.LANGUAGE_MODEL
        regex = re.compile(r'(\(original.{0,3}\).+)', re.IGNORECASE | re.MULTILINE | re.DOTALL)

        self.comment_dict['text'] = regex.sub('', self.comment_dict['text'])
        doc = nlp(self.comment_dict['text'])

        # The sequence length is sized from this single document. (The former
        # "second, larger model" branch could never run: with one document
        # both length estimates were identical.)
        model = self.load_ner_model(int(1.5 * len(doc)))

        try:
            self.comment_dict['_doc'] = doc
            self._apply_ner_model(model, self.comment_dict)
        except Exception:
            import logging
            logging.getLogger(__name__).exception('Failed to apply NER model; skipping comment')
            self.comment_dict['skip'] = True
        finally:
            # Never leave the spaCy Doc in the dict: it is not serialisable.
            self.comment_dict.pop('_doc', None)
        return self.comment_dict

    def _apply_ner_model(self, ner_model, item):
        """Predict a label per token of ``item['_doc']`` and write ``item['spans']``.

        ``item['_doc']`` is removed from *item*. If the number of predictions
        does not match the number of tokens, ``spans`` is set to an empty list.
        """
        doc = item.pop('_doc')

        predictions, _ = ner_model.predict([[t.text for t in doc]], split_on_space=False)
        predictions = predictions[0]

        tokens = doc.to_json()['tokens']
        if len(tokens) != len(predictions):
            item['spans'] = []
            return

        # Each prediction is a one-entry mapping; its value is the label.
        for token, prediction in zip(tokens, predictions):
            token['label'] = next(iter(prediction.values()))

        labels = [token['label'] for token in tokens]

        spans = self.labels_to_spans(tokens, labels)
        item['spans'] = self.postprocess_spans(spans)

    def postprocess_spans(self, spans):
        """Relabel very short spans (< 3 tokens) and merge neighbouring spans with equal labels.

        A short span takes its only neighbour's label at either end of the
        list, takes the shared label when both neighbours agree, and is
        dropped ('O') otherwise or when it is the only span. Span dicts are
        relabelled in place.

        :return: list of ``{'start', 'end', 'label'}`` dicts
        """
        last = len(spans) - 1
        for j, span in enumerate(spans):
            if span['n_tokens'] >= 3:
                continue
            if len(spans) <= 1:
                span['label'] = 'O'
            elif j == 0:
                span['label'] = spans[j + 1]['label']
            elif j == last:
                span['label'] = spans[j - 1]['label']
            elif spans[j - 1]['label'] == spans[j + 1]['label']:
                span['label'] = spans[j - 1]['label']
            else:
                span['label'] = 'O'

        new_spans = []
        for label, label_spans in itertools.groupby(spans, key=lambda s: s['label']):
            if label == 'O':
                continue

            label_spans = list(label_spans)
            new_spans.append({'start': label_spans[0]['start'], 'end': label_spans[-1]['end'], 'label': label})

        return new_spans

    def process_comment(self):
        """Run the whole pipeline and return the result dict including ``color_map``."""
        sentiment = dict()
        self.comment_dict.update(self.get_score())
        self.cleaner()
        try:
            review_dict_entities = self.apply_ner_model()
            sentiment = self.apply_sentiment_model(review_dict_entities)
            self.reformat_output(sentiment)
        # for very small texts ner model errors
        except AssertionError:
            self.comment_dict.pop('_doc', None)
            self.comment_dict["skip"] = True
            sentiment.update(self.comment_dict)

        label_color_mappings = [{"label": label, "color": label_color}
                                for label, label_color in LABEL_COLOR.items()]
        sentiment.update({"color_map": label_color_mappings})
        return sentiment

    def main(self):
        """Entry point; equivalent to :meth:`process_comment`."""
        return self.process_comment()
|
| 300 |
+
|
| 301 |
+
|
| 302 |
+
class SentenceBoundsFinder:
    """Callable that splits text into sentences and reports their character bounds."""

    def __init__(self, nlp=None):
        # Fall back to the language model from Django settings when no
        # pipeline is supplied.
        self._nlp = nlp if nlp else settings.LANGUAGE_MODEL

    def __call__(self, text):
        """Return a list of ``(start_char, end_char)`` tuples, one per sentence of *text*."""
        return [(sentence.start_char, sentence.end_char) for sentence in self._nlp(text).sents]
|
| 315 |
+
|
| 316 |
+
|
| 317 |
+
class ReviewsCleaner:
    """
    Class for the cleaning of review dataset and collecting statistics on cleaning
    :param replace_emojis: Replace emojis with text representing them
    :param unicode_normalize: Normalize unicode chars (NFD, combining marks dropped)
    :param remove_non_regular_chars: Remove chars with ordinal number >= 128 (keep ASCII only)
    :param remove_junk: Remove characters that are not relevant for the reviews and often corrupt tokens (* \\n \\r \\t)
    :param remove_double_spaces: Collapse runs of whitespace into single spaces
    :param remove_boundary_quotes: Remove a quote at the very start/end of the text
    :param same_quotes: Transform quote marks (" ’ ` “) into the single quote mark
    """

    def __init__(self, replace_emojis=True, unicode_normalize=True, remove_non_regular_chars=True, remove_junk=True,
                 remove_double_spaces=True, remove_boundary_quotes=True, same_quotes=True):
        self.methods = []
        # Add new methods here !!! MIND THE ORDER !!!
        if replace_emojis:
            self.methods.append(('Deemojize', lambda text: self.__demojize(text)))
        if unicode_normalize:
            self.methods.append(('Normalize', lambda text: ''.join(
                c for c in unicodedata.normalize('NFD', text) if unicodedata.category(c) != 'Mn')))
        if same_quotes:
            self.methods.append(('Same quotes', lambda text: re.sub('"|’|`|“', '\'', text)))
        if remove_boundary_quotes:
            self.methods.append(('Rm boundary quotes', lambda text: self.__remove_boundary(text)))
        if remove_junk:
            self.methods.append(('Remove junk', lambda text: re.sub(r'\*|\n|\r|\t|_x000D_', ' ', text)))
        if remove_non_regular_chars:
            self.methods.append(('Remove non-regular', lambda text: ''.join(c for c in text if ord(c) < 128)))
        if remove_double_spaces:
            self.methods.append(('Remove double spaces', lambda text: ' '.join(text.split())))
        # name -> [characters changed, reviews affected]
        self.stats = {name: [0, 0] for name, _ in self.methods}
        self.analyzed_reviews = 0
        self.skipped = 0

    def clean_stats(self):
        """Reset statistics"""
        # Same shape as in __init__; the previous set-of-lists expression
        # raised TypeError (lists are unhashable).
        self.stats = {name: [0, 0] for name, _ in self.methods}
        self.analyzed_reviews = 0
        self.skipped = 0

    def print_stats(self):
        """Print statistics of used methods"""
        print(f'Reviews analyzed: {self.analyzed_reviews}')
        if not self.analyzed_reviews:
            # Nothing analysed yet: percentages are undefined, avoid dividing by zero.
            return
        print("{:<20} {:<10} {:<10}".format('Name', 'Avg. % of chars', '% of reviews affected'))
        for name, item in self.stats.items():
            print("{:<20} {:<10} {:<10}".format(name, f'{(100 * item[0] / self.analyzed_reviews):.2f}%',
                                                f'{(100 * item[1] / self.analyzed_reviews):.2f}%'))
        print(f'Language skip\t-\t{(100 * self.skipped / self.analyzed_reviews):.2f}%')

    def clean_text(self, text):
        """Clean line of text by applying every enabled method in order"""
        self.analyzed_reviews += 1
        if len(text) == 0:
            return text

        for _, method_fun in self.methods:
            text = method_fun(text)
        return text

    @staticmethod
    def __demojize(text):
        """Replace emojis with their space-delimited names, dropping skin-tone suffixes"""
        text = demojize(text, delimiters=[' ', ' '])
        text = re.sub('_[a-z]*_skin_tone', '', text)
        return text

    @staticmethod
    def __remove_boundary(text):
        """Strip one single quote from the start and from the end of the text, if present"""
        if text[:1] == '\'':
            text = text[1:]
        if text[-1:] == '\'':
            text = text[:-1]
        return text
|
hf_demo/einstein/models.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from django.db import models
|
| 2 |
+
|
| 3 |
+
# Create your models here.
|
hf_demo/einstein/tests.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from django.test import TestCase
|
| 2 |
+
|
| 3 |
+
# Create your tests here.
|
hf_demo/einstein/urls.py
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from django.urls import path
|
| 2 |
+
|
| 3 |
+
from einstein.views import FileProcessingView
|
| 4 |
+
# Routes exposed by the einstein app.
urlpatterns = [
    # POST endpoint that accepts an uploaded review file.
    path(
        route='file_process/',
        view=FileProcessingView.as_view(),
        name='file-process',
    ),
]
|
hf_demo/einstein/views.py
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from django.shortcuts import render
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
import io
|
| 6 |
+
from django.conf import settings
|
| 7 |
+
from django.views.generic import View
|
| 8 |
+
from django.http import JsonResponse
|
| 9 |
+
from rest_framework.views import APIView
|
| 10 |
+
from rest_framework.response import Response
|
| 11 |
+
|
| 12 |
+
from einstein.ml_service import MlProcessing
|
| 13 |
+
|
| 14 |
+
class FileProcessingView(APIView):
    """API endpoint that runs the ML pipeline over every review of an uploaded file."""

    def post(self, request):
        """Process the file uploaded under the ``file`` field.

        :return: ``{"processed_data": [...]}`` with one result per row, or
            ``{"processed_data": False}`` for an unsupported file type;
            HTTP 400 when no file was sent.
        """
        file = request.data.get('file')
        if file is None:
            # Previously this surfaced as an AttributeError (HTTP 500).
            return Response({"detail": "No file was uploaded under the 'file' field."}, status=400)

        processed_data = self.process(file)
        return Response({"processed_data": processed_data})

    def process(self, file):
        """Parse a CSV/Excel upload and run ``MlProcessing`` on each row.

        Columns read per row: ``REVIEWID``, ``ACTUAL REVIEW`` and
        ``STAR RATING`` (defaults to 1 when the column is absent). For Excel
        files a ``DATE`` column, if present, is reformatted as text.

        :param file: uploaded file object exposing ``read()`` and ``name``
        :return: list of per-review result dicts, or ``False`` when the file
            extension is not .csv/.xls/.xlsx
        """
        parse_started = time.time()
        file_stream = io.BytesIO(file.read())
        # Compare case-insensitively so e.g. "REVIEWS.CSV" is accepted.
        file_extension = os.path.splitext(file.name)[1].lower()
        if file_extension == '.csv':
            df = pd.read_csv(file_stream, encoding='utf-8')
        elif file_extension in ('.xls', '.xlsx'):
            df = pd.read_excel(file_stream)
            # `params` is not defined by APIView; honour it only if a
            # subclass/mixin provides it, otherwise use the settings default.
            params = getattr(self, 'params', None) or {}
            date_col_format = params.get('date_format', settings.DATE_FORMAT)
            date_header = 'DATE'
            if date_header in df.columns:
                df[date_header] = pd.to_datetime(df[date_header]).dt.strftime(date_col_format)
        else:
            return False

        processed_data = list()
        parse_finished = time.time()
        print(f"File parsing time : {parse_finished - parse_started} seconds")
        for _, data_obj in df.iterrows():
            data_obj = data_obj.fillna('')
            review_id = data_obj.get('REVIEWID')
            # Reviews made only of digits are parsed as numbers; the
            # pipeline needs text.
            plain_text = str(data_obj.get('ACTUAL REVIEW', str()))
            star_rating = data_obj.get('STAR RATING', 1)
            raw_data = {"text": plain_text, "star_rating": star_rating, "skip": False}
            processed_text = MlProcessing(raw_data).main()
            processed_text.update({'review_id': review_id})
            processed_data.append(processed_text)
        print(f"Instances DB loading time : {time.time() - parse_finished} seconds")
        # Return the results themselves; the former constant `True` meant the
        # API never delivered the processed reviews.
        return processed_data
|
| 52 |
+
|
| 53 |
+
|
hf_demo/fastText/.circleci/cmake_test.sh
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
#
|
| 3 |
+
# Copyright (c) 2016-present, Facebook, Inc.
|
| 4 |
+
# All rights reserved.
|
| 5 |
+
#
|
| 6 |
+
# This source code is licensed under the MIT license found in the
|
| 7 |
+
# LICENSE file in the root directory of this source tree.
|
| 8 |
+
#
|
| 9 |
+
|
| 10 |
+
RESULTDIR=result
DATADIR=data

./.circleci/pull_data.sh

# Make sure the output directory exists before fasttext writes into it.
mkdir -p "${RESULTDIR}"

# Build inside a subshell so the working directory is restored even when
# cmake/make fails; -p keeps re-runs from failing on an existing directory.
mkdir -p buildc && (cd buildc && cmake .. && make)
cp buildc/fasttext .

# Train, evaluate and predict on the dbpedia data set.
./fasttext supervised -input "${DATADIR}/dbpedia.train" -output "${RESULTDIR}/dbpedia" -dim 10 -lr 0.1 -wordNgrams 2 -minCount 1 -bucket 10000000 -epoch 5 -thread 4 -verbose 0
./fasttext test "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test"
./fasttext predict "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test" > "${RESULTDIR}/dbpedia.test.predict"
|
hf_demo/fastText/.circleci/config.yml
ADDED
|
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python CircleCI 2.0 configuration file
|
| 2 |
+
#
|
| 3 |
+
# Check https://circleci.com/docs/2.0/language-python/ for more details
|
| 4 |
+
#
|
| 5 |
+
# Copyright (c) 2016-present, Facebook, Inc.
|
| 6 |
+
# All rights reserved.
|
| 7 |
+
#
|
| 8 |
+
# This source code is licensed under the MIT license found in the
|
| 9 |
+
# LICENSE file in the root directory of this source tree.
|
| 10 |
+
#
|
| 11 |
+
|
| 12 |
+
# Maybe one day this will work
|
| 13 |
+
# "mac":
|
| 14 |
+
# macos:
|
| 15 |
+
# xcode: "9.0"
|
| 16 |
+
# working_directory: ~/repo
|
| 17 |
+
# steps:
|
| 18 |
+
# - checkout
|
| 19 |
+
# - run:
|
| 20 |
+
# command: |
|
| 21 |
+
# . .circleci/cmake_test.sh
|
| 22 |
+
|
| 23 |
+
version: 2
|
| 24 |
+
jobs:
|
| 25 |
+
"py368":
|
| 26 |
+
docker:
|
| 27 |
+
- image: circleci/python:3.6.8
|
| 28 |
+
working_directory: ~/repo
|
| 29 |
+
steps:
|
| 30 |
+
- checkout
|
| 31 |
+
- run:
|
| 32 |
+
command: |
|
| 33 |
+
. .circleci/setup_circleimg.sh
|
| 34 |
+
. .circleci/python_test.sh
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
"py357":
|
| 38 |
+
docker:
|
| 39 |
+
- image: circleci/python:3.5.7
|
| 40 |
+
working_directory: ~/repo
|
| 41 |
+
steps:
|
| 42 |
+
- checkout
|
| 43 |
+
- run:
|
| 44 |
+
command: |
|
| 45 |
+
. .circleci/setup_circleimg.sh
|
| 46 |
+
. .circleci/python_test.sh
|
| 47 |
+
|
| 48 |
+
"py3410":
|
| 49 |
+
docker:
|
| 50 |
+
- image: circleci/python:3.4.10
|
| 51 |
+
working_directory: ~/repo
|
| 52 |
+
steps:
|
| 53 |
+
- checkout
|
| 54 |
+
- run:
|
| 55 |
+
command: |
|
| 56 |
+
. .circleci/setup_circleimg.sh
|
| 57 |
+
. .circleci/python_test.sh
|
| 58 |
+
|
| 59 |
+
"py2715":
|
| 60 |
+
docker:
|
| 61 |
+
- image: circleci/python:2.7.15
|
| 62 |
+
working_directory: ~/repo
|
| 63 |
+
steps:
|
| 64 |
+
- checkout
|
| 65 |
+
- run:
|
| 66 |
+
command: |
|
| 67 |
+
. .circleci/setup_circleimg.sh
|
| 68 |
+
. .circleci/python_test.sh
|
| 69 |
+
|
| 70 |
+
"gcc5":
|
| 71 |
+
docker:
|
| 72 |
+
- image: gcc:5
|
| 73 |
+
working_directory: ~/repo
|
| 74 |
+
steps:
|
| 75 |
+
- checkout
|
| 76 |
+
- run:
|
| 77 |
+
command: |
|
| 78 |
+
. .circleci/gcc_test.sh
|
| 79 |
+
|
| 80 |
+
"gcc6":
|
| 81 |
+
docker:
|
| 82 |
+
- image: gcc:6
|
| 83 |
+
working_directory: ~/repo
|
| 84 |
+
steps:
|
| 85 |
+
- checkout
|
| 86 |
+
- run:
|
| 87 |
+
command: |
|
| 88 |
+
. .circleci/gcc_test.sh
|
| 89 |
+
|
| 90 |
+
"gcc7":
|
| 91 |
+
docker:
|
| 92 |
+
- image: gcc:7
|
| 93 |
+
working_directory: ~/repo
|
| 94 |
+
steps:
|
| 95 |
+
- checkout
|
| 96 |
+
- run:
|
| 97 |
+
command: |
|
| 98 |
+
. .circleci/gcc_test.sh
|
| 99 |
+
|
| 100 |
+
"gcclatest":
|
| 101 |
+
docker:
|
| 102 |
+
- image: gcc:latest
|
| 103 |
+
working_directory: ~/repo
|
| 104 |
+
steps:
|
| 105 |
+
- checkout
|
| 106 |
+
- run:
|
| 107 |
+
command: |
|
| 108 |
+
. .circleci/gcc_test.sh
|
| 109 |
+
|
| 110 |
+
"debian-stretch-gcc":
|
| 111 |
+
docker:
|
| 112 |
+
- image: debian:stretch
|
| 113 |
+
working_directory: ~/repo
|
| 114 |
+
steps:
|
| 115 |
+
- checkout
|
| 116 |
+
- run:
|
| 117 |
+
command: |
|
| 118 |
+
. .circleci/setup_debian.sh
|
| 119 |
+
. .circleci/gcc_test.sh
|
| 120 |
+
|
| 121 |
+
"debian-stretch-cmake":
|
| 122 |
+
docker:
|
| 123 |
+
- image: debian:stretch
|
| 124 |
+
working_directory: ~/repo
|
| 125 |
+
steps:
|
| 126 |
+
- checkout
|
| 127 |
+
- run:
|
| 128 |
+
command: |
|
| 129 |
+
. .circleci/setup_debian.sh
|
| 130 |
+
. .circleci/cmake_test.sh
|
| 131 |
+
|
| 132 |
+
"debian-stretch-python":
|
| 133 |
+
docker:
|
| 134 |
+
- image: debian:stretch
|
| 135 |
+
working_directory: ~/repo
|
| 136 |
+
steps:
|
| 137 |
+
- checkout
|
| 138 |
+
- run:
|
| 139 |
+
command: |
|
| 140 |
+
. .circleci/setup_debian.sh
|
| 141 |
+
pip install .
|
| 142 |
+
python runtests.py -u
|
| 143 |
+
|
| 144 |
+
"debian-jessie-gcc":
|
| 145 |
+
docker:
|
| 146 |
+
- image: debian:jessie
|
| 147 |
+
working_directory: ~/repo
|
| 148 |
+
steps:
|
| 149 |
+
- checkout
|
| 150 |
+
- run:
|
| 151 |
+
command: |
|
| 152 |
+
. .circleci/setup_debian.sh
|
| 153 |
+
. .circleci/gcc_test.sh
|
| 154 |
+
|
| 155 |
+
"debian-jessie-cmake":
|
| 156 |
+
docker:
|
| 157 |
+
- image: debian:jessie
|
| 158 |
+
working_directory: ~/repo
|
| 159 |
+
steps:
|
| 160 |
+
- checkout
|
| 161 |
+
- run:
|
| 162 |
+
command: |
|
| 163 |
+
. .circleci/setup_debian.sh
|
| 164 |
+
. .circleci/cmake_test.sh
|
| 165 |
+
|
| 166 |
+
"website-build":
|
| 167 |
+
docker:
|
| 168 |
+
- image: node:latest
|
| 169 |
+
working_directory: ~/repo
|
| 170 |
+
steps:
|
| 171 |
+
- checkout
|
| 172 |
+
- run:
|
| 173 |
+
command: |
|
| 174 |
+
git config --global user.email "docusaurus-bot@users.noreply.github.com"
|
| 175 |
+
git config --global user.name "Website Deployment Script"
|
| 176 |
+
echo "machine github.com login docusaurus-bot password $GITHUB_TOKEN_DOCUSAURUS_BOT" > ~/.netrc
|
| 177 |
+
cd website && npm install && GIT_USER=docusaurus-bot npm run publish-gh-pages
|
| 178 |
+
|
| 179 |
+
workflows:
|
| 180 |
+
version: 2
|
| 181 |
+
build:
|
| 182 |
+
jobs:
|
| 183 |
+
- "py368"
|
| 184 |
+
- "py357"
|
| 185 |
+
- "py3410"
|
| 186 |
+
- "py2715"
|
| 187 |
+
- "gcc5"
|
| 188 |
+
- "gcc6"
|
| 189 |
+
- "gcc7"
|
| 190 |
+
- "gcclatest"
|
| 191 |
+
- "website-build"
|
| 192 |
+
- "debian-stretch-gcc"
|
| 193 |
+
- "debian-stretch-cmake"
|
| 194 |
+
- "debian-stretch-python"
|
| 195 |
+
- "debian-jessie-gcc"
|
| 196 |
+
- "debian-jessie-cmake"
|
hf_demo/fastText/.circleci/gcc_test.sh
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
#
|
| 3 |
+
# Copyright (c) 2016-present, Facebook, Inc.
|
| 4 |
+
# All rights reserved.
|
| 5 |
+
#
|
| 6 |
+
# This source code is licensed under the MIT license found in the
|
| 7 |
+
# LICENSE file in the root directory of this source tree.
|
| 8 |
+
#
|
| 9 |
+
|
| 10 |
+
RESULTDIR=result
|
| 11 |
+
DATADIR=data
|
| 12 |
+
|
| 13 |
+
./.circleci/pull_data.sh
|
| 14 |
+
make opt
|
| 15 |
+
./fasttext supervised -input "${DATADIR}/dbpedia.train" -output "${RESULTDIR}/dbpedia" -dim 10 -lr 0.1 -wordNgrams 2 -minCount 1 -bucket 10000000 -epoch 5 -thread 4 -verbose 0
|
| 16 |
+
./fasttext test "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test"
|
| 17 |
+
./fasttext predict "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test" > "${RESULTDIR}/dbpedia.test.predict"
|
| 18 |
+
|
| 19 |
+
make clean
|
| 20 |
+
make debug
|
| 21 |
+
./fasttext supervised -input "${DATADIR}/dbpedia.train" -output "${RESULTDIR}/dbpedia" -dim 10 -lr 0.1 -wordNgrams 2 -minCount 1 -bucket 10000000 -epoch 5 -thread 4 -verbose 0
|
| 22 |
+
./fasttext test "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test"
|
| 23 |
+
./fasttext predict "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test" > "${RESULTDIR}/dbpedia.test.predict"
|
| 24 |
+
|
| 25 |
+
|
hf_demo/fastText/.circleci/pip_test.sh
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
#
|
| 3 |
+
# Copyright (c) 2016-present, Facebook, Inc.
|
| 4 |
+
# All rights reserved.
|
| 5 |
+
#
|
| 6 |
+
# This source code is licensed under the MIT license found in the
|
| 7 |
+
# LICENSE file in the root directory of this source tree.
|
| 8 |
+
#
|
| 9 |
+
|
| 10 |
+
sudo pip install --index-url https://test.pypi.org/simple/ fasttext
|
| 11 |
+
python runtests.py -u
|
hf_demo/fastText/.circleci/pull_data.sh
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
#
|
| 3 |
+
# Copyright (c) 2016-present, Facebook, Inc.
|
| 4 |
+
# All rights reserved.
|
| 5 |
+
#
|
| 6 |
+
# This source code is licensed under the MIT license found in the
|
| 7 |
+
# LICENSE file in the root directory of this source tree.
|
| 8 |
+
#
|
| 9 |
+
|
| 10 |
+
myshuf() {
|
| 11 |
+
perl -MList::Util=shuffle -e 'print shuffle(<>);' "$@";
|
| 12 |
+
}
|
| 13 |
+
|
| 14 |
+
normalize_text() {
|
| 15 |
+
tr '[:upper:]' '[:lower:]' | sed -e 's/^/__label__/g' | \
|
| 16 |
+
sed -e "s/'/ ' /g" -e 's/"//g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' \
|
| 17 |
+
-e 's/,/ , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
|
| 18 |
+
-e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' | tr -s " " | myshuf
|
| 19 |
+
}
|
| 20 |
+
|
| 21 |
+
RESULTDIR=result
|
| 22 |
+
DATADIR=data
|
| 23 |
+
|
| 24 |
+
mkdir -p "${RESULTDIR}"
|
| 25 |
+
mkdir -p "${DATADIR}"
|
| 26 |
+
|
| 27 |
+
if [ ! -f "${DATADIR}/dbpedia.train" ]
|
| 28 |
+
then
|
| 29 |
+
wget -c "https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k" -O "${DATADIR}/dbpedia_csv.tar.gz"
|
| 30 |
+
tar -xzvf "${DATADIR}/dbpedia_csv.tar.gz" -C "${DATADIR}"
|
| 31 |
+
cat "${DATADIR}/dbpedia_csv/train.csv" | normalize_text > "${DATADIR}/dbpedia.train"
|
| 32 |
+
cat "${DATADIR}/dbpedia_csv/test.csv" | normalize_text > "${DATADIR}/dbpedia.test"
|
| 33 |
+
fi
|
hf_demo/fastText/.circleci/python_test.sh
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
#
|
| 3 |
+
# Copyright (c) 2016-present, Facebook, Inc.
|
| 4 |
+
# All rights reserved.
|
| 5 |
+
#
|
| 6 |
+
# This source code is licensed under the MIT license found in the
|
| 7 |
+
# LICENSE file in the root directory of this source tree.
|
| 8 |
+
#
|
| 9 |
+
|
| 10 |
+
sudo pip install .
|
| 11 |
+
python runtests.py -u
|
hf_demo/fastText/.circleci/run_locally.sh
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
#
|
| 3 |
+
# Copyright (c) 2016-present, Facebook, Inc.
|
| 4 |
+
# All rights reserved.
|
| 5 |
+
#
|
| 6 |
+
# This source code is licensed under the MIT license found in the
|
| 7 |
+
# LICENSE file in the root directory of this source tree.
|
| 8 |
+
#
|
| 9 |
+
|
| 10 |
+
# This script illustrates how to run the build tests locally
|
| 11 |
+
# This requires docker
|
| 12 |
+
|
| 13 |
+
tail -n 15 .circleci/config.yml | sed s/.\\+\"\\\(\.\\+\\\)\"/\\1/g | xargs -P 4 -o -I {} bash -c "circleci build --job {} && (>&2 echo "{}")" > /dev/null
|
hf_demo/fastText/.circleci/setup_circleimg.sh
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
#
|
| 3 |
+
# Copyright (c) 2016-present, Facebook, Inc.
|
| 4 |
+
# All rights reserved.
|
| 5 |
+
#
|
| 6 |
+
# This source code is licensed under the MIT license found in the
|
| 7 |
+
# LICENSE file in the root directory of this source tree.
|
| 8 |
+
#
|
| 9 |
+
|
| 10 |
+
sudo apt-get update
|
| 11 |
+
sudo apt-get install -y cmake python-pip python-dev build-essential
|
hf_demo/fastText/.circleci/setup_debian.sh
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
#
|
| 3 |
+
# Copyright (c) 2016-present, Facebook, Inc.
|
| 4 |
+
# All rights reserved.
|
| 5 |
+
#
|
| 6 |
+
# This source code is licensed under the MIT license found in the
|
| 7 |
+
# LICENSE file in the root directory of this source tree.
|
| 8 |
+
#
|
| 9 |
+
|
| 10 |
+
apt-get update
|
| 11 |
+
apt-get install -y vim g++ make cmake wget git python-pip python-dev build-essential
|
hf_demo/fastText/.gitignore
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.*.swp
|
| 2 |
+
*.o
|
| 3 |
+
*.bin
|
| 4 |
+
*.vec
|
| 5 |
+
*.bc
|
| 6 |
+
.DS_Store
|
| 7 |
+
data
|
| 8 |
+
fasttext
|
| 9 |
+
result
|
| 10 |
+
website/node_modules/
|
| 11 |
+
package-lock.json
|
| 12 |
+
node_modules/
|
hf_demo/fastText/CMakeLists.txt
ADDED
|
@@ -0,0 +1,80 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#
|
| 2 |
+
# Copyright (c) 2016-present, Facebook, Inc.
|
| 3 |
+
# All rights reserved.
|
| 4 |
+
#
|
| 5 |
+
# This source code is licensed under the MIT license found in the
|
| 6 |
+
# LICENSE file in the root directory of this source tree.
|
| 7 |
+
#
|
| 8 |
+
|
| 9 |
+
cmake_minimum_required(VERSION 2.8.9)
|
| 10 |
+
project(fasttext)
|
| 11 |
+
|
| 12 |
+
set(CMAKE_CXX_STANDARD 17)
|
| 13 |
+
|
| 14 |
+
# The version number.
|
| 15 |
+
set (fasttext_VERSION_MAJOR 0)
|
| 16 |
+
set (fasttext_VERSION_MINOR 1)
|
| 17 |
+
|
| 18 |
+
include_directories(fasttext)
|
| 19 |
+
|
| 20 |
+
set(CMAKE_CXX_FLAGS " -pthread -std=c++17 -funroll-loops -O3 -march=native")
|
| 21 |
+
|
| 22 |
+
set(HEADER_FILES
|
| 23 |
+
src/args.h
|
| 24 |
+
src/autotune.h
|
| 25 |
+
src/densematrix.h
|
| 26 |
+
src/dictionary.h
|
| 27 |
+
src/fasttext.h
|
| 28 |
+
src/loss.h
|
| 29 |
+
src/matrix.h
|
| 30 |
+
src/meter.h
|
| 31 |
+
src/model.h
|
| 32 |
+
src/productquantizer.h
|
| 33 |
+
src/quantmatrix.h
|
| 34 |
+
src/real.h
|
| 35 |
+
src/utils.h
|
| 36 |
+
src/vector.h)
|
| 37 |
+
|
| 38 |
+
set(SOURCE_FILES
|
| 39 |
+
src/args.cc
|
| 40 |
+
src/autotune.cc
|
| 41 |
+
src/densematrix.cc
|
| 42 |
+
src/dictionary.cc
|
| 43 |
+
src/fasttext.cc
|
| 44 |
+
src/loss.cc
|
| 45 |
+
src/main.cc
|
| 46 |
+
src/matrix.cc
|
| 47 |
+
src/meter.cc
|
| 48 |
+
src/model.cc
|
| 49 |
+
src/productquantizer.cc
|
| 50 |
+
src/quantmatrix.cc
|
| 51 |
+
src/utils.cc
|
| 52 |
+
src/vector.cc)
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
if (NOT MSVC)
|
| 56 |
+
include(GNUInstallDirs)
|
| 57 |
+
configure_file("fasttext.pc.in" "fasttext.pc" @ONLY)
|
| 58 |
+
install(FILES "${CMAKE_BINARY_DIR}/fasttext.pc" DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig)
|
| 59 |
+
endif()
|
| 60 |
+
|
| 61 |
+
add_library(fasttext-shared SHARED ${SOURCE_FILES} ${HEADER_FILES})
|
| 62 |
+
add_library(fasttext-static STATIC ${SOURCE_FILES} ${HEADER_FILES})
|
| 63 |
+
add_library(fasttext-static_pic STATIC ${SOURCE_FILES} ${HEADER_FILES})
|
| 64 |
+
set_target_properties(fasttext-shared PROPERTIES OUTPUT_NAME fasttext
|
| 65 |
+
SOVERSION "${fasttext_VERSION_MAJOR}")
|
| 66 |
+
set_target_properties(fasttext-static PROPERTIES OUTPUT_NAME fasttext)
|
| 67 |
+
set_target_properties(fasttext-static_pic PROPERTIES OUTPUT_NAME fasttext_pic
|
| 68 |
+
POSITION_INDEPENDENT_CODE True)
|
| 69 |
+
add_executable(fasttext-bin src/main.cc)
|
| 70 |
+
target_link_libraries(fasttext-bin pthread fasttext-static)
|
| 71 |
+
set_target_properties(fasttext-bin PROPERTIES PUBLIC_HEADER "${HEADER_FILES}" OUTPUT_NAME fasttext)
|
| 72 |
+
install (TARGETS fasttext-shared
|
| 73 |
+
LIBRARY DESTINATION lib)
|
| 74 |
+
install (TARGETS fasttext-static
|
| 75 |
+
ARCHIVE DESTINATION lib)
|
| 76 |
+
install (TARGETS fasttext-static_pic
|
| 77 |
+
ARCHIVE DESTINATION lib)
|
| 78 |
+
install (TARGETS fasttext-bin
|
| 79 |
+
RUNTIME DESTINATION bin
|
| 80 |
+
PUBLIC_HEADER DESTINATION include/fasttext)
|
hf_demo/fastText/CODE_OF_CONDUCT.md
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Code of Conduct
|
| 2 |
+
|
| 3 |
+
## Our Pledge
|
| 4 |
+
|
| 5 |
+
In the interest of fostering an open and welcoming environment, we as
|
| 6 |
+
contributors and maintainers pledge to make participation in our project and
|
| 7 |
+
our community a harassment-free experience for everyone, regardless of age, body
|
| 8 |
+
size, disability, ethnicity, sex characteristics, gender identity and expression,
|
| 9 |
+
level of experience, education, socio-economic status, nationality, personal
|
| 10 |
+
appearance, race, religion, or sexual identity and orientation.
|
| 11 |
+
|
| 12 |
+
## Our Standards
|
| 13 |
+
|
| 14 |
+
Examples of behavior that contributes to creating a positive environment
|
| 15 |
+
include:
|
| 16 |
+
|
| 17 |
+
* Using welcoming and inclusive language
|
| 18 |
+
* Being respectful of differing viewpoints and experiences
|
| 19 |
+
* Gracefully accepting constructive criticism
|
| 20 |
+
* Focusing on what is best for the community
|
| 21 |
+
* Showing empathy towards other community members
|
| 22 |
+
|
| 23 |
+
Examples of unacceptable behavior by participants include:
|
| 24 |
+
|
| 25 |
+
* The use of sexualized language or imagery and unwelcome sexual attention or
|
| 26 |
+
advances
|
| 27 |
+
* Trolling, insulting/derogatory comments, and personal or political attacks
|
| 28 |
+
* Public or private harassment
|
| 29 |
+
* Publishing others' private information, such as a physical or electronic
|
| 30 |
+
address, without explicit permission
|
| 31 |
+
* Other conduct which could reasonably be considered inappropriate in a
|
| 32 |
+
professional setting
|
| 33 |
+
|
| 34 |
+
## Our Responsibilities
|
| 35 |
+
|
| 36 |
+
Project maintainers are responsible for clarifying the standards of acceptable
|
| 37 |
+
behavior and are expected to take appropriate and fair corrective action in
|
| 38 |
+
response to any instances of unacceptable behavior.
|
| 39 |
+
|
| 40 |
+
Project maintainers have the right and responsibility to remove, edit, or
|
| 41 |
+
reject comments, commits, code, wiki edits, issues, and other contributions
|
| 42 |
+
that are not aligned to this Code of Conduct, or to ban temporarily or
|
| 43 |
+
permanently any contributor for other behaviors that they deem inappropriate,
|
| 44 |
+
threatening, offensive, or harmful.
|
| 45 |
+
|
| 46 |
+
## Scope
|
| 47 |
+
|
| 48 |
+
This Code of Conduct applies within all project spaces, and it also applies when
|
| 49 |
+
an individual is representing the project or its community in public spaces.
|
| 50 |
+
Examples of representing a project or community include using an official
|
| 51 |
+
project e-mail address, posting via an official social media account, or acting
|
| 52 |
+
as an appointed representative at an online or offline event. Representation of
|
| 53 |
+
a project may be further defined and clarified by project maintainers.
|
| 54 |
+
|
| 55 |
+
## Enforcement
|
| 56 |
+
|
| 57 |
+
Instances of abusive, harassing, or otherwise unacceptable behavior may be
|
| 58 |
+
reported by contacting the project team at <opensource-conduct@fb.com>. All
|
| 59 |
+
complaints will be reviewed and investigated and will result in a response that
|
| 60 |
+
is deemed necessary and appropriate to the circumstances. The project team is
|
| 61 |
+
obligated to maintain confidentiality with regard to the reporter of an incident.
|
| 62 |
+
Further details of specific enforcement policies may be posted separately.
|
| 63 |
+
|
| 64 |
+
Project maintainers who do not follow or enforce the Code of Conduct in good
|
| 65 |
+
faith may face temporary or permanent repercussions as determined by other
|
| 66 |
+
members of the project's leadership.
|
| 67 |
+
|
| 68 |
+
## Attribution
|
| 69 |
+
|
| 70 |
+
This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
|
| 71 |
+
available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
|
| 72 |
+
|
| 73 |
+
[homepage]: https://www.contributor-covenant.org
|
| 74 |
+
|
| 75 |
+
For answers to common questions about this code of conduct, see
|
| 76 |
+
https://www.contributor-covenant.org/faq
|
| 77 |
+
|
hf_demo/fastText/CONTRIBUTING.md
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Contributing to fastText
|
| 2 |
+
We want to make contributing to this project as easy and transparent as possible.
|
| 3 |
+
|
| 4 |
+
## Issues
|
| 5 |
+
We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue.
|
| 6 |
+
|
| 7 |
+
### Reproducing issues
|
| 8 |
+
Please make sure that the issue you mention is not a result of one of the existing third-party libraries. For example, please do not post an issue if you encountered an error within a third-party Python library. We can only help you with errors which can be directly reproduced either with our C++ code or the corresponding Python bindings. If you do find an error, please post detailed steps to reproduce it. If we can't reproduce your error, we can't help you fix it.
|
| 9 |
+
|
| 10 |
+
## Pull Requests
|
| 11 |
+
Please post an Issue before submitting a pull request. This might save you some time, as it is possible we can't support your contribution, although we try our best to accommodate your (planned) work and highly appreciate your time. Generally, it is best to have a pull request emerge from an issue rather than the other way around.
|
| 12 |
+
|
| 13 |
+
To create a pull request:
|
| 14 |
+
|
| 15 |
+
1. Fork the repo and create your branch from `master`.
|
| 16 |
+
2. If you've added code that should be tested, add tests.
|
| 17 |
+
3. If you've changed APIs, update the documentation.
|
| 18 |
+
4. Ensure the test suite passes.
|
| 19 |
+
5. Make sure your code lints.
|
| 20 |
+
6. If you haven't already, complete the Contributor License Agreement ("CLA").
|
| 21 |
+
|
| 22 |
+
## Tests
|
| 23 |
+
First, you will need to make sure you have the required data. For that, please have a look at the fetch_test_data.sh script under tests. Next, run the tests using the runtests.py script, passing it the path to the directory containing the datasets.
|
| 24 |
+
|
| 25 |
+
## Contributor License Agreement ("CLA")
|
| 26 |
+
In order to accept your pull request, we need you to submit a CLA. You only need
|
| 27 |
+
to do this once to work on any of Facebook's open source projects.
|
| 28 |
+
|
| 29 |
+
Complete your CLA here: <https://code.facebook.com/cla>
|
| 30 |
+
|
| 31 |
+
## License
|
| 32 |
+
By contributing to fastText, you agree that your contributions will be licensed under its MIT license.
|
hf_demo/fastText/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2016-present, Facebook, Inc.
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
hf_demo/fastText/MANIFEST.in
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
include LICENSE
|
| 2 |
+
include PATENTS
|
| 3 |
+
|
| 4 |
+
recursive-include python *.md *.rst
|
| 5 |
+
recursive-include src *.h
|
hf_demo/fastText/Makefile
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#
|
| 2 |
+
# Copyright (c) 2016-present, Facebook, Inc.
|
| 3 |
+
# All rights reserved.
|
| 4 |
+
#
|
| 5 |
+
# This source code is licensed under the MIT license found in the
|
| 6 |
+
# LICENSE file in the root directory of this source tree.
|
| 7 |
+
#
|
| 8 |
+
|
| 9 |
+
CXX = c++
|
| 10 |
+
CXXFLAGS = -pthread -std=c++17 -march=native
|
| 11 |
+
OBJS = args.o autotune.o matrix.o dictionary.o loss.o productquantizer.o densematrix.o quantmatrix.o vector.o model.o utils.o meter.o fasttext.o
|
| 12 |
+
INCLUDES = -I.
|
| 13 |
+
|
| 14 |
+
opt: CXXFLAGS += -O3 -funroll-loops -DNDEBUG
|
| 15 |
+
opt: fasttext
|
| 16 |
+
|
| 17 |
+
coverage: CXXFLAGS += -O0 -fno-inline -fprofile-arcs --coverage
|
| 18 |
+
coverage: fasttext
|
| 19 |
+
|
| 20 |
+
debug: CXXFLAGS += -g -O0 -fno-inline
|
| 21 |
+
debug: fasttext
|
| 22 |
+
|
| 23 |
+
wasm: webassembly/fasttext_wasm.js
|
| 24 |
+
|
| 25 |
+
wasmdebug: export EMCC_DEBUG=1
|
| 26 |
+
wasmdebug: webassembly/fasttext_wasm.js
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
args.o: src/args.cc src/args.h
|
| 30 |
+
$(CXX) $(CXXFLAGS) -c src/args.cc
|
| 31 |
+
|
| 32 |
+
autotune.o: src/autotune.cc src/autotune.h
|
| 33 |
+
$(CXX) $(CXXFLAGS) -c src/autotune.cc
|
| 34 |
+
|
| 35 |
+
matrix.o: src/matrix.cc src/matrix.h
|
| 36 |
+
$(CXX) $(CXXFLAGS) -c src/matrix.cc
|
| 37 |
+
|
| 38 |
+
dictionary.o: src/dictionary.cc src/dictionary.h src/args.h
|
| 39 |
+
$(CXX) $(CXXFLAGS) -c src/dictionary.cc
|
| 40 |
+
|
| 41 |
+
loss.o: src/loss.cc src/loss.h src/matrix.h src/real.h
|
| 42 |
+
$(CXX) $(CXXFLAGS) -c src/loss.cc
|
| 43 |
+
|
| 44 |
+
productquantizer.o: src/productquantizer.cc src/productquantizer.h src/utils.h
|
| 45 |
+
$(CXX) $(CXXFLAGS) -c src/productquantizer.cc
|
| 46 |
+
|
| 47 |
+
densematrix.o: src/densematrix.cc src/densematrix.h src/utils.h src/matrix.h
|
| 48 |
+
$(CXX) $(CXXFLAGS) -c src/densematrix.cc
|
| 49 |
+
|
| 50 |
+
quantmatrix.o: src/quantmatrix.cc src/quantmatrix.h src/utils.h src/matrix.h
|
| 51 |
+
$(CXX) $(CXXFLAGS) -c src/quantmatrix.cc
|
| 52 |
+
|
| 53 |
+
vector.o: src/vector.cc src/vector.h src/utils.h
|
| 54 |
+
$(CXX) $(CXXFLAGS) -c src/vector.cc
|
| 55 |
+
|
| 56 |
+
model.o: src/model.cc src/model.h src/args.h
|
| 57 |
+
$(CXX) $(CXXFLAGS) -c src/model.cc
|
| 58 |
+
|
| 59 |
+
utils.o: src/utils.cc src/utils.h
|
| 60 |
+
$(CXX) $(CXXFLAGS) -c src/utils.cc
|
| 61 |
+
|
| 62 |
+
meter.o: src/meter.cc src/meter.h
|
| 63 |
+
$(CXX) $(CXXFLAGS) -c src/meter.cc
|
| 64 |
+
|
| 65 |
+
fasttext.o: src/fasttext.cc src/*.h
|
| 66 |
+
$(CXX) $(CXXFLAGS) -c src/fasttext.cc
|
| 67 |
+
|
| 68 |
+
fasttext: $(OBJS) src/fasttext.cc src/main.cc
|
| 69 |
+
$(CXX) $(CXXFLAGS) $(OBJS) src/main.cc -o fasttext
|
| 70 |
+
|
| 71 |
+
clean:
|
| 72 |
+
rm -rf *.o *.gcno *.gcda fasttext *.bc webassembly/fasttext_wasm.js webassembly/fasttext_wasm.wasm
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
EMCXX = em++
|
| 76 |
+
EMCXXFLAGS = --bind --std=c++11 -s WASM=1 -s ALLOW_MEMORY_GROWTH=1 -s "EXTRA_EXPORTED_RUNTIME_METHODS=['addOnPostRun', 'FS']" -s "DISABLE_EXCEPTION_CATCHING=0" -s "EXCEPTION_DEBUG=1" -s "FORCE_FILESYSTEM=1" -s "MODULARIZE=1" -s "EXPORT_ES6=1" -s 'EXPORT_NAME="FastTextModule"' -Isrc/
|
| 77 |
+
EMOBJS = args.bc autotune.bc matrix.bc dictionary.bc loss.bc productquantizer.bc densematrix.bc quantmatrix.bc vector.bc model.bc utils.bc meter.bc fasttext.bc main.bc
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
main.bc: webassembly/fasttext_wasm.cc
|
| 81 |
+
$(EMCXX) $(EMCXXFLAGS) webassembly/fasttext_wasm.cc -o main.bc
|
| 82 |
+
|
| 83 |
+
args.bc: src/args.cc src/args.h
|
| 84 |
+
$(EMCXX) $(EMCXXFLAGS) src/args.cc -o args.bc
|
| 85 |
+
|
| 86 |
+
autotune.bc: src/autotune.cc src/autotune.h
|
| 87 |
+
$(EMCXX) $(EMCXXFLAGS) src/autotune.cc -o autotune.bc
|
| 88 |
+
|
| 89 |
+
matrix.bc: src/matrix.cc src/matrix.h
|
| 90 |
+
$(EMCXX) $(EMCXXFLAGS) src/matrix.cc -o matrix.bc
|
| 91 |
+
|
| 92 |
+
dictionary.bc: src/dictionary.cc src/dictionary.h src/args.h
|
| 93 |
+
$(EMCXX) $(EMCXXFLAGS) src/dictionary.cc -o dictionary.bc
|
| 94 |
+
|
| 95 |
+
loss.bc: src/loss.cc src/loss.h src/matrix.h src/real.h
|
| 96 |
+
$(EMCXX) $(EMCXXFLAGS) src/loss.cc -o loss.bc
|
| 97 |
+
|
| 98 |
+
productquantizer.bc: src/productquantizer.cc src/productquantizer.h src/utils.h
|
| 99 |
+
$(EMCXX) $(EMCXXFLAGS) src/productquantizer.cc -o productquantizer.bc
|
| 100 |
+
|
| 101 |
+
densematrix.bc: src/densematrix.cc src/densematrix.h src/utils.h src/matrix.h
|
| 102 |
+
$(EMCXX) $(EMCXXFLAGS) src/densematrix.cc -o densematrix.bc
|
| 103 |
+
|
| 104 |
+
quantmatrix.bc: src/quantmatrix.cc src/quantmatrix.h src/utils.h src/matrix.h
|
| 105 |
+
$(EMCXX) $(EMCXXFLAGS) src/quantmatrix.cc -o quantmatrix.bc
|
| 106 |
+
|
| 107 |
+
vector.bc: src/vector.cc src/vector.h src/utils.h
|
| 108 |
+
$(EMCXX) $(EMCXXFLAGS) src/vector.cc -o vector.bc
|
| 109 |
+
|
| 110 |
+
model.bc: src/model.cc src/model.h src/args.h
|
| 111 |
+
$(EMCXX) $(EMCXXFLAGS) src/model.cc -o model.bc
|
| 112 |
+
|
| 113 |
+
utils.bc: src/utils.cc src/utils.h
|
| 114 |
+
$(EMCXX) $(EMCXXFLAGS) src/utils.cc -o utils.bc
|
| 115 |
+
|
| 116 |
+
meter.bc: src/meter.cc src/meter.h
|
| 117 |
+
$(EMCXX) $(EMCXXFLAGS) src/meter.cc -o meter.bc
|
| 118 |
+
|
| 119 |
+
fasttext.bc: src/fasttext.cc src/*.h
|
| 120 |
+
$(EMCXX) $(EMCXXFLAGS) src/fasttext.cc -o fasttext.bc
|
| 121 |
+
|
| 122 |
+
webassembly/fasttext_wasm.js: $(EMOBJS) webassembly/fasttext_wasm.cc Makefile
|
| 123 |
+
$(EMCXX) $(EMCXXFLAGS) $(EMOBJS) -o webassembly/fasttext_wasm.js
|
| 124 |
+
|
| 125 |
+
|
hf_demo/fastText/PACKAGE
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
load("@fbcode_macros//build_defs:package_local_utils.bzl", "package_local_utils")
|
| 2 |
+
|
| 3 |
+
package_local_utils.set_clang_version(15, True)
|
hf_demo/fastText/README.md
ADDED
|
@@ -0,0 +1,339 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# fastText
|
| 2 |
+
[fastText](https://fasttext.cc/) is a library for efficient learning of word representations and sentence classification.
|
| 3 |
+
|
| 4 |
+
[![CircleCI](https://circleci.com/gh/facebookresearch/fastText/tree/master.svg?style=svg)](https://circleci.com/gh/facebookresearch/fastText/tree/master)
|
| 5 |
+
|
| 6 |
+
## Table of contents
|
| 7 |
+
|
| 8 |
+
* [Resources](#resources)
|
| 9 |
+
* [Models](#models)
|
| 10 |
+
* [Supplementary data](#supplementary-data)
|
| 11 |
+
* [FAQ](#faq)
|
| 12 |
+
* [Cheatsheet](#cheatsheet)
|
| 13 |
+
* [Requirements](#requirements)
|
| 14 |
+
* [Building fastText](#building-fasttext)
|
| 15 |
+
* [Getting the source code](#getting-the-source-code)
|
| 16 |
+
* [Building fastText using make (preferred)](#building-fasttext-using-make-preferred)
|
| 17 |
+
* [Building fastText using cmake](#building-fasttext-using-cmake)
|
| 18 |
+
* [Building fastText for Python](#building-fasttext-for-python)
|
| 19 |
+
* [Example use cases](#example-use-cases)
|
| 20 |
+
* [Word representation learning](#word-representation-learning)
|
| 21 |
+
* [Obtaining word vectors for out-of-vocabulary words](#obtaining-word-vectors-for-out-of-vocabulary-words)
|
| 22 |
+
* [Text classification](#text-classification)
|
| 23 |
+
* [Full documentation](#full-documentation)
|
| 24 |
+
* [References](#references)
|
| 25 |
+
* [Enriching Word Vectors with Subword Information](#enriching-word-vectors-with-subword-information)
|
| 26 |
+
* [Bag of Tricks for Efficient Text Classification](#bag-of-tricks-for-efficient-text-classification)
|
| 27 |
+
* [FastText.zip: Compressing text classification models](#fasttextzip-compressing-text-classification-models)
|
| 28 |
+
* [Join the fastText community](#join-the-fasttext-community)
|
| 29 |
+
* [License](#license)
|
| 30 |
+
|
| 31 |
+
## Resources
|
| 32 |
+
|
| 33 |
+
### Models
|
| 34 |
+
- Recent state-of-the-art [English word vectors](https://fasttext.cc/docs/en/english-vectors.html).
|
| 35 |
+
- Word vectors for [157 languages trained on Wikipedia and Crawl](https://github.com/facebookresearch/fastText/blob/master/docs/crawl-vectors.md).
|
| 36 |
+
- Models for [language identification](https://fasttext.cc/docs/en/language-identification.html#content) and [various supervised tasks](https://fasttext.cc/docs/en/supervised-models.html#content).
|
| 37 |
+
|
| 38 |
+
### Supplementary data
|
| 39 |
+
- The preprocessed [YFCC100M data](https://fasttext.cc/docs/en/dataset.html#content) used in [2].
|
| 40 |
+
|
| 41 |
+
### FAQ
|
| 42 |
+
|
| 43 |
+
You can find [answers to frequently asked questions](https://fasttext.cc/docs/en/faqs.html#content) on our [website](https://fasttext.cc/).
|
| 44 |
+
|
| 45 |
+
### Cheatsheet
|
| 46 |
+
|
| 47 |
+
We also provide a [cheatsheet](https://fasttext.cc/docs/en/cheatsheet.html#content) full of useful one-liners.
|
| 48 |
+
|
| 49 |
+
## Requirements
|
| 50 |
+
|
| 51 |
+
We are continuously building and testing our library, CLI and Python bindings under various docker images using [circleci](https://circleci.com/).
|
| 52 |
+
|
| 53 |
+
Generally, **fastText** builds on modern Mac OS and Linux distributions.
|
| 54 |
+
Since it uses some C++11 features, it requires a compiler with good C++11 support.
|
| 55 |
+
These include:
|
| 56 |
+
|
| 57 |
+
* (g++-4.7.2 or newer) or (clang-3.3 or newer)
|
| 58 |
+
|
| 59 |
+
Compilation is carried out using a Makefile, so you will need to have a working **make**.
|
| 60 |
+
If you want to use **cmake** you need at least version 2.8.9.
|
| 61 |
+
|
| 62 |
+
One of the oldest distributions we successfully built and tested the CLI under is [Debian jessie](https://www.debian.org/releases/jessie/).
|
| 63 |
+
|
| 64 |
+
For the word-similarity evaluation script you will need:
|
| 65 |
+
|
| 66 |
+
* Python 2.6 or newer
|
| 67 |
+
* NumPy & SciPy
|
| 68 |
+
|
| 69 |
+
For the python bindings (see the subdirectory python) you will need:
|
| 70 |
+
|
| 71 |
+
* Python version 2.7 or >=3.4
|
| 72 |
+
* NumPy & SciPy
|
| 73 |
+
* [pybind11](https://github.com/pybind/pybind11)
|
| 74 |
+
|
| 75 |
+
One of the oldest distributions we successfully built and tested the Python bindings under is [Debian jessie](https://www.debian.org/releases/jessie/).
|
| 76 |
+
|
| 77 |
+
If these requirements make it impossible for you to use fastText, please open an issue and we will try to accommodate you.
|
| 78 |
+
|
| 79 |
+
## Building fastText
|
| 80 |
+
|
| 81 |
+
We discuss building the latest stable version of fastText.
|
| 82 |
+
|
| 83 |
+
### Getting the source code
|
| 84 |
+
|
| 85 |
+
You can find our [latest stable release](https://github.com/facebookresearch/fastText/releases/latest) in the usual place.
|
| 86 |
+
|
| 87 |
+
There is also the master branch that contains all of our most recent work, but comes along with all the usual caveats of an unstable branch. You might want to use this if you are a developer or power-user.
|
| 88 |
+
|
| 89 |
+
### Building fastText using make (preferred)
|
| 90 |
+
|
| 91 |
+
```
|
| 92 |
+
$ wget https://github.com/facebookresearch/fastText/archive/v0.9.2.zip
|
| 93 |
+
$ unzip v0.9.2.zip
|
| 94 |
+
$ cd fastText-0.9.2
|
| 95 |
+
$ make
|
| 96 |
+
```
|
| 97 |
+
|
| 98 |
+
This will produce object files for all the classes as well as the main binary `fasttext`.
|
| 99 |
+
If you do not plan on using the default system-wide compiler, update the two macros defined at the beginning of the Makefile (CC and INCLUDES).
|
| 100 |
+
|
| 101 |
+
### Building fastText using cmake
|
| 102 |
+
|
| 103 |
+
For now this is not part of a release, so you will need to clone the master branch.
|
| 104 |
+
|
| 105 |
+
```
|
| 106 |
+
$ git clone https://github.com/facebookresearch/fastText.git
|
| 107 |
+
$ cd fastText
|
| 108 |
+
$ mkdir build && cd build && cmake ..
|
| 109 |
+
$ make && make install
|
| 110 |
+
```
|
| 111 |
+
|
| 112 |
+
This will create the fasttext binary and also all relevant libraries (shared, static, PIC).
|
| 113 |
+
|
| 114 |
+
### Building fastText for Python
|
| 115 |
+
|
| 116 |
+
For now this is not part of a release, so you will need to clone the master branch.
|
| 117 |
+
|
| 118 |
+
```
|
| 119 |
+
$ git clone https://github.com/facebookresearch/fastText.git
|
| 120 |
+
$ cd fastText
|
| 121 |
+
$ pip install .
|
| 122 |
+
```
|
| 123 |
+
|
| 124 |
+
For further information and introduction see python/README.md
|
| 125 |
+
|
| 126 |
+
## Example use cases
|
| 127 |
+
|
| 128 |
+
This library has two main use cases: word representation learning and text classification.
|
| 129 |
+
These were described in the two papers [1](#enriching-word-vectors-with-subword-information) and [2](#bag-of-tricks-for-efficient-text-classification).
|
| 130 |
+
|
| 131 |
+
### Word representation learning
|
| 132 |
+
|
| 133 |
+
In order to learn word vectors, as described in [1](#enriching-word-vectors-with-subword-information), do:
|
| 134 |
+
|
| 135 |
+
```
|
| 136 |
+
$ ./fasttext skipgram -input data.txt -output model
|
| 137 |
+
```
|
| 138 |
+
|
| 139 |
+
where `data.txt` is a training file containing `UTF-8` encoded text.
|
| 140 |
+
By default the word vectors will take into account character n-grams from 3 to 6 characters.
|
| 141 |
+
At the end of optimization the program will save two files: `model.bin` and `model.vec`.
|
| 142 |
+
`model.vec` is a text file containing the word vectors, one per line.
|
| 143 |
+
`model.bin` is a binary file containing the parameters of the model along with the dictionary and all hyper parameters.
|
| 144 |
+
The binary file can be used later to compute word vectors or to restart the optimization.
|
| 145 |
+
|
| 146 |
+
### Obtaining word vectors for out-of-vocabulary words
|
| 147 |
+
|
| 148 |
+
The previously trained model can be used to compute word vectors for out-of-vocabulary words.
|
| 149 |
+
Provided you have a text file `queries.txt` containing words for which you want to compute vectors, use the following command:
|
| 150 |
+
|
| 151 |
+
```
|
| 152 |
+
$ ./fasttext print-word-vectors model.bin < queries.txt
|
| 153 |
+
```
|
| 154 |
+
|
| 155 |
+
This will output word vectors to the standard output, one vector per line.
|
| 156 |
+
This can also be used with pipes:
|
| 157 |
+
|
| 158 |
+
```
|
| 159 |
+
$ cat queries.txt | ./fasttext print-word-vectors model.bin
|
| 160 |
+
```
|
| 161 |
+
|
| 162 |
+
See the provided scripts for an example. For instance, running:
|
| 163 |
+
|
| 164 |
+
```
|
| 165 |
+
$ ./word-vector-example.sh
|
| 166 |
+
```
|
| 167 |
+
|
| 168 |
+
will compile the code, download data, compute word vectors and evaluate them on the rare words similarity dataset RW [Luong et al. 2013].
|
| 169 |
+
|
| 170 |
+
### Text classification
|
| 171 |
+
|
| 172 |
+
This library can also be used to train supervised text classifiers, for instance for sentiment analysis.
|
| 173 |
+
In order to train a text classifier using the method described in [2](#bag-of-tricks-for-efficient-text-classification), use:
|
| 174 |
+
|
| 175 |
+
```
|
| 176 |
+
$ ./fasttext supervised -input train.txt -output model
|
| 177 |
+
```
|
| 178 |
+
|
| 179 |
+
where `train.txt` is a text file containing a training sentence per line along with the labels.
|
| 180 |
+
By default, we assume that labels are words that are prefixed by the string `__label__`.
|
| 181 |
+
This will output two files: `model.bin` and `model.vec`.
|
| 182 |
+
Once the model has been trained, you can evaluate it by computing the precision and recall at k (P@k and R@k) on a test set using:
|
| 183 |
+
|
| 184 |
+
```
|
| 185 |
+
$ ./fasttext test model.bin test.txt k
|
| 186 |
+
```
|
| 187 |
+
|
| 188 |
+
The argument `k` is optional, and is equal to `1` by default.
|
| 189 |
+
|
| 190 |
+
In order to obtain the k most likely labels for a piece of text, use:
|
| 191 |
+
|
| 192 |
+
```
|
| 193 |
+
$ ./fasttext predict model.bin test.txt k
|
| 194 |
+
```
|
| 195 |
+
|
| 196 |
+
or use `predict-prob` to also get the probability for each label
|
| 197 |
+
|
| 198 |
+
```
|
| 199 |
+
$ ./fasttext predict-prob model.bin test.txt k
|
| 200 |
+
```
|
| 201 |
+
|
| 202 |
+
where `test.txt` contains a piece of text to classify per line.
|
| 203 |
+
Doing so will print to the standard output the k most likely labels for each line.
|
| 204 |
+
The argument `k` is optional, and equal to `1` by default.
|
| 205 |
+
See `classification-example.sh` for an example use case.
|
| 206 |
+
In order to reproduce results from the paper [2](#bag-of-tricks-for-efficient-text-classification), run `classification-results.sh`; this will download all the datasets and reproduce the results from Table 1.
|
| 207 |
+
|
| 208 |
+
If you want to compute vector representations of sentences or paragraphs, please use:
|
| 209 |
+
|
| 210 |
+
```
|
| 211 |
+
$ ./fasttext print-sentence-vectors model.bin < text.txt
|
| 212 |
+
```
|
| 213 |
+
|
| 214 |
+
This assumes that the `text.txt` file contains the paragraphs that you want to get vectors for.
|
| 215 |
+
The program will output one vector representation per line in the file.
|
| 216 |
+
|
| 217 |
+
You can also quantize a supervised model to reduce its memory usage with the following command:
|
| 218 |
+
|
| 219 |
+
```
|
| 220 |
+
$ ./fasttext quantize -output model
|
| 221 |
+
```
|
| 222 |
+
This will create a `.ftz` file with a smaller memory footprint. All the standard functionality, like `test` or `predict`, works the same way on the quantized models:
|
| 223 |
+
```
|
| 224 |
+
$ ./fasttext test model.ftz test.txt
|
| 225 |
+
```
|
| 226 |
+
The quantization procedure follows the steps described in [3](#fasttextzip-compressing-text-classification-models). You can
|
| 227 |
+
run the script `quantization-example.sh` for an example.
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
## Full documentation
|
| 231 |
+
|
| 232 |
+
Invoke a command without arguments to list available arguments and their default values:
|
| 233 |
+
|
| 234 |
+
```
|
| 235 |
+
$ ./fasttext supervised
|
| 236 |
+
Empty input or output path.
|
| 237 |
+
|
| 238 |
+
The following arguments are mandatory:
|
| 239 |
+
-input training file path
|
| 240 |
+
-output output file path
|
| 241 |
+
|
| 242 |
+
The following arguments are optional:
|
| 243 |
+
-verbose verbosity level [2]
|
| 244 |
+
|
| 245 |
+
The following arguments for the dictionary are optional:
|
| 246 |
+
-minCount minimal number of word occurrences [1]
|
| 247 |
+
-minCountLabel minimal number of label occurrences [0]
|
| 248 |
+
-wordNgrams max length of word ngram [1]
|
| 249 |
+
-bucket number of buckets [2000000]
|
| 250 |
+
-minn min length of char ngram [0]
|
| 251 |
+
-maxn max length of char ngram [0]
|
| 252 |
+
-t sampling threshold [0.0001]
|
| 253 |
+
-label labels prefix [__label__]
|
| 254 |
+
|
| 255 |
+
The following arguments for training are optional:
|
| 256 |
+
-lr learning rate [0.1]
|
| 257 |
+
-lrUpdateRate change the rate of updates for the learning rate [100]
|
| 258 |
+
-dim size of word vectors [100]
|
| 259 |
+
-ws size of the context window [5]
|
| 260 |
+
-epoch number of epochs [5]
|
| 261 |
+
-neg number of negatives sampled [5]
|
| 262 |
+
-loss loss function {ns, hs, softmax} [softmax]
|
| 263 |
+
-thread number of threads [12]
|
| 264 |
+
-pretrainedVectors pretrained word vectors for supervised learning []
|
| 265 |
+
-saveOutput whether output params should be saved [0]
|
| 266 |
+
|
| 267 |
+
The following arguments for quantization are optional:
|
| 268 |
+
-cutoff number of words and ngrams to retain [0]
|
| 269 |
+
-retrain finetune embeddings if a cutoff is applied [0]
|
| 270 |
+
-qnorm quantizing the norm separately [0]
|
| 271 |
+
-qout quantizing the classifier [0]
|
| 272 |
+
-dsub size of each sub-vector [2]
|
| 273 |
+
```
|
| 274 |
+
|
| 275 |
+
Defaults may vary by mode. (Word-representation modes `skipgram` and `cbow` use a default `-minCount` of 5.)
|
| 276 |
+
|
| 277 |
+
## References
|
| 278 |
+
|
| 279 |
+
Please cite [1](#enriching-word-vectors-with-subword-information) if using this code for learning word representations or [2](#bag-of-tricks-for-efficient-text-classification) if using for text classification.
|
| 280 |
+
|
| 281 |
+
### Enriching Word Vectors with Subword Information
|
| 282 |
+
|
| 283 |
+
[1] P. Bojanowski\*, E. Grave\*, A. Joulin, T. Mikolov, [*Enriching Word Vectors with Subword Information*](https://arxiv.org/abs/1607.04606)
|
| 284 |
+
|
| 285 |
+
```
|
| 286 |
+
@article{bojanowski2017enriching,
|
| 287 |
+
title={Enriching Word Vectors with Subword Information},
|
| 288 |
+
author={Bojanowski, Piotr and Grave, Edouard and Joulin, Armand and Mikolov, Tomas},
|
| 289 |
+
journal={Transactions of the Association for Computational Linguistics},
|
| 290 |
+
volume={5},
|
| 291 |
+
year={2017},
|
| 292 |
+
issn={2307-387X},
|
| 293 |
+
pages={135--146}
|
| 294 |
+
}
|
| 295 |
+
```
|
| 296 |
+
|
| 297 |
+
### Bag of Tricks for Efficient Text Classification
|
| 298 |
+
|
| 299 |
+
[2] A. Joulin, E. Grave, P. Bojanowski, T. Mikolov, [*Bag of Tricks for Efficient Text Classification*](https://arxiv.org/abs/1607.01759)
|
| 300 |
+
|
| 301 |
+
```
|
| 302 |
+
@InProceedings{joulin2017bag,
|
| 303 |
+
title={Bag of Tricks for Efficient Text Classification},
|
| 304 |
+
author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Mikolov, Tomas},
|
| 305 |
+
booktitle={Proceedings of the 15th Conference of the European Chapter of the Association for Computational Linguistics: Volume 2, Short Papers},
|
| 306 |
+
month={April},
|
| 307 |
+
year={2017},
|
| 308 |
+
publisher={Association for Computational Linguistics},
|
| 309 |
+
pages={427--431},
|
| 310 |
+
}
|
| 311 |
+
```
|
| 312 |
+
|
| 313 |
+
### FastText.zip: Compressing text classification models
|
| 314 |
+
|
| 315 |
+
[3] A. Joulin, E. Grave, P. Bojanowski, M. Douze, H. Jégou, T. Mikolov, [*FastText.zip: Compressing text classification models*](https://arxiv.org/abs/1612.03651)
|
| 316 |
+
|
| 317 |
+
```
|
| 318 |
+
@article{joulin2016fasttext,
|
| 319 |
+
title={FastText.zip: Compressing text classification models},
|
| 320 |
+
author={Joulin, Armand and Grave, Edouard and Bojanowski, Piotr and Douze, Matthijs and J{\'e}gou, Herv{\'e} and Mikolov, Tomas},
|
| 321 |
+
journal={arXiv preprint arXiv:1612.03651},
|
| 322 |
+
year={2016}
|
| 323 |
+
}
|
| 324 |
+
```
|
| 325 |
+
|
| 326 |
+
(\* These authors contributed equally.)
|
| 327 |
+
|
| 328 |
+
|
| 329 |
+
## Join the fastText community
|
| 330 |
+
|
| 331 |
+
* Facebook page: https://www.facebook.com/groups/1174547215919768
|
| 332 |
+
* Google group: https://groups.google.com/forum/#!forum/fasttext-library
|
| 333 |
+
* Contact: [egrave@fb.com](mailto:egrave@fb.com), [bojanowski@fb.com](mailto:bojanowski@fb.com), [ajoulin@fb.com](mailto:ajoulin@fb.com), [tmikolov@fb.com](mailto:tmikolov@fb.com)
|
| 334 |
+
|
| 335 |
+
See the CONTRIBUTING file for information about how to help out.
|
| 336 |
+
|
| 337 |
+
## License
|
| 338 |
+
|
| 339 |
+
fastText is MIT-licensed.
|
hf_demo/fastText/alignment/README.md
ADDED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Alignment of Word Embeddings
|
| 2 |
+
|
| 3 |
+
This directory provides code for learning alignments between word embeddings in different languages.
|
| 4 |
+
|
| 5 |
+
The code is in Python 3 and requires [NumPy](http://www.numpy.org/).
|
| 6 |
+
|
| 7 |
+
The script `example.sh` shows how to use this code to learn and evaluate a bilingual alignment of word embeddings.
|
| 8 |
+
|
| 9 |
+
The word embeddings used in [1] can be found on the [fastText project page](https://fasttext.cc) and the supervised bilingual lexicons on the [MUSE project page](https://github.com/facebookresearch/MUSE).
|
| 10 |
+
|
| 11 |
+
### Supervised alignment
|
| 12 |
+
|
| 13 |
+
The script `align.py` aligns word embeddings from two languages using a bilingual lexicon as supervision.
|
| 14 |
+
The details of this approach can be found in [1].
|
| 15 |
+
|
| 16 |
+
### Unsupervised alignment
|
| 17 |
+
|
| 18 |
+
The script `unsup_align.py` aligns word embeddings from two languages without requiring any supervision.
|
| 19 |
+
Additionally, the script `unsup_multialign.py` aligns multiple languages to a common space with no supervision.
|
| 20 |
+
The details of these approaches can be found in [2] and [3] respectively.
|
| 21 |
+
|
| 22 |
+
In addition to NumPy, the unsupervised methods require the [Python Optimal Transport](https://pot.readthedocs.io/en/stable/) toolbox.
|
| 23 |
+
|
| 24 |
+
### Download
|
| 25 |
+
|
| 26 |
+
Wikipedia fastText embeddings aligned with our method can be found [here](https://fasttext.cc/docs/en/aligned-vectors.html).
|
| 27 |
+
|
| 28 |
+
### References
|
| 29 |
+
|
| 30 |
+
If you use the supervised alignment method, please cite:
|
| 31 |
+
|
| 32 |
+
[1] A. Joulin, P. Bojanowski, T. Mikolov, H. Jegou, E. Grave, [*Loss in Translation: Learning Bilingual Word Mapping with a Retrieval Criterion*](https://arxiv.org/abs/1804.07745)
|
| 33 |
+
|
| 34 |
+
```
|
| 35 |
+
@InProceedings{joulin2018loss,
|
| 36 |
+
title={Loss in Translation: Learning Bilingual Word Mapping with a Retrieval Criterion},
|
| 37 |
+
author={Joulin, Armand and Bojanowski, Piotr and Mikolov, Tomas and J\'egou, Herv\'e and Grave, Edouard},
|
| 38 |
+
year={2018},
|
| 39 |
+
booktitle={Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
|
| 40 |
+
}
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
If you use the unsupervised bilingual alignment method, please cite:
|
| 44 |
+
|
| 45 |
+
[2] E. Grave, A. Joulin, Q. Berthet, [*Unsupervised Alignment of Embeddings with Wasserstein Procrustes*](https://arxiv.org/abs/1805.11222)
|
| 46 |
+
|
| 47 |
+
```
|
| 48 |
+
@article{grave2018unsupervised,
|
| 49 |
+
title={Unsupervised Alignment of Embeddings with Wasserstein Procrustes},
|
| 50 |
+
author={Grave, Edouard and Joulin, Armand and Berthet, Quentin},
|
| 51 |
+
journal={arXiv preprint arXiv:1805.11222},
|
| 52 |
+
year={2018}
|
| 53 |
+
}
|
| 54 |
+
```
|
| 55 |
+
|
| 56 |
+
If you use the unsupervised alignment script `unsup_multialign.py`, please cite:
|
| 57 |
+
|
| 58 |
+
[3] J. Alaux, E. Grave, M. Cuturi, A. Joulin, [*Unsupervised Hyperalignment for Multilingual Word Embeddings*](https://arxiv.org/abs/1811.01124)
|
| 59 |
+
|
| 60 |
+
```
|
| 61 |
+
@article{alaux2018unsupervised,
|
| 62 |
+
title={Unsupervised hyperalignment for multilingual word embeddings},
|
| 63 |
+
author={Alaux, Jean and Grave, Edouard and Cuturi, Marco and Joulin, Armand},
|
| 64 |
+
journal={arXiv preprint arXiv:1811.01124},
|
| 65 |
+
year={2018}
|
| 66 |
+
}
|
| 67 |
+
```
|
hf_demo/fastText/alignment/align.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
#
|
| 4 |
+
# Copyright (c) 2018-present, Facebook, Inc.
|
| 5 |
+
# All rights reserved.
|
| 6 |
+
#
|
| 7 |
+
# This source code is licensed under the license found in the
|
| 8 |
+
# LICENSE file in the root directory of this source tree.
|
| 9 |
+
|
| 10 |
+
import numpy as np
|
| 11 |
+
import argparse
|
| 12 |
+
from utils import *
|
| 13 |
+
import sys
|
| 14 |
+
|
| 15 |
+
# Command-line interface of the RCSLS alignment script.
parser = argparse.ArgumentParser(
    description='RCSLS for supervised word alignment')

# Input embeddings and optional centering.
parser.add_argument(
    "--src_emb", type=str, default='', help="Load source embeddings")
parser.add_argument(
    "--tgt_emb", type=str, default='', help="Load target embeddings")
parser.add_argument(
    '--center', action='store_true',
    help='whether to center embeddings or not')

# Bilingual dictionaries used for training and validation.
parser.add_argument(
    "--dico_train", type=str, default='', help="train dictionary")
parser.add_argument(
    "--dico_test", type=str, default='', help="validation dictionary")

# Destination of the aligned embeddings (empty string: nothing is saved).
parser.add_argument(
    "--output", type=str, default='',
    help="where to save aligned embeddings")

# Size limits for neighbours, negatives, supervision and loaded vectors.
parser.add_argument(
    "--knn", type=int, default=10,
    help="number of nearest neighbors in RCSL/CSLS")
parser.add_argument(
    "--maxneg", type=int, default=200000,
    help="Maximum number of negatives for the Extended RCSLS")
parser.add_argument(
    "--maxsup", type=int, default=-1,
    help="Maximum number of training examples")
parser.add_argument(
    "--maxload", type=int, default=200000,
    help="Maximum number of loaded vectors")

# Constraint set and regularization strength.
parser.add_argument(
    "--model", type=str, default="none",
    help="Set of constraints: spectral or none")
parser.add_argument(
    "--reg", type=float, default=0.0, help='regularization parameters')

# Optimizer settings.
parser.add_argument(
    "--lr", type=float, default=1.0, help='learning rate')
parser.add_argument(
    "--niter", type=int, default=10, help='number of iterations')
parser.add_argument(
    '--sgd', action='store_true', help='use sgd')
parser.add_argument(
    "--batchsize", type=int, default=10000, help="batch size for sgd")

params = parser.parse_args()
|
| 40 |
+
|
| 41 |
+
###### SPECIFIC FUNCTIONS ######
|
| 42 |
+
# functions specific to RCSLS
|
| 43 |
+
# the rest of the functions are in utils.py
|
| 44 |
+
|
| 45 |
+
def getknn(sc, x, y, k=10):
    """Average the k largest scores of every row of ``sc`` and build the matching gradient term.

    Args:
        sc: 2-D score matrix; column j of ``sc`` is associated with row j of ``y``.
        x: 2-D array with one row per row of ``sc``.
        y: 2-D array with one row per column of ``sc``.
        k: number of top-scoring columns kept per row. Values larger than
            the number of columns of ``sc`` are clamped to that number, so
            the average is then taken over every available candidate.

    Returns:
        A pair ``(f, df)`` where ``f`` is the sum of the selected scores
        divided by ``k`` and ``df`` is ``dot(S.T, x) / k``, with ``S`` holding,
        for each row of ``sc``, the sum of the selected rows of ``y``.

    Raises:
        ValueError: if ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))
    # np.argpartition rejects a kth outside the axis, so never ask for more
    # neighbours than there are candidate columns.
    k = min(k, sc.shape[1])

    # Column indices of the k largest entries of each row (unordered).
    top_idx = np.argpartition(sc, -k, axis=1)[:, -k:]
    # Fancy indexing gathers the selected rows of y: (rows of sc, k, y.shape[1]).
    y_top = y[top_idx]
    row_idx = np.arange(sc.shape[0])[:, None]
    f = np.sum(sc[row_idx, top_idx])
    df = np.dot(y_top.sum(1).T, x)
    return f / k, df / k
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def rcsls(X_src, Y_tgt, Z_src, Z_tgt, R, knn=10):
    """Evaluate the negated RCSLS criterion and its derivative term for mapping ``R``.

    Args:
        X_src: source vectors of the training pairs, one row per pair.
        Y_tgt: target vectors of the training pairs, row-aligned with ``X_src``.
        Z_src: source vectors searched for the neighbours of ``Y_tgt``.
        Z_tgt: target vectors searched for the neighbours of the mapped ``X_src``.
        R: current mapping; source vectors are mapped as ``dot(X, R.T)``.
        knn: number of neighbours forwarded to ``getknn``.

    Returns:
        ``(loss, grad)``: the pair-similarity term minus both neighbourhood
        terms, negated and divided by the number of training pairs, together
        with the derivative term assembled the same way.
    """
    n_pairs = X_src.shape[0]
    mapped_src = np.dot(X_src, R.T)

    # Twice the summed similarity between mapped sources and their targets.
    pair_score = 2 * np.sum(mapped_src * Y_tgt)
    pair_grad = 2 * np.dot(Y_tgt.T, X_src)

    # Neighbourhood term: mapped training sources against the target pool.
    fwd_sim = np.dot(mapped_src, Z_tgt.T)
    fwd_score, fwd_grad = getknn(fwd_sim, X_src, Z_tgt, knn)

    # Neighbourhood term: training targets against the mapped source pool.
    bwd_sim = np.dot(np.dot(Z_src, R.T), Y_tgt.T).T
    bwd_score, bwd_grad = getknn(bwd_sim, Y_tgt, Z_src, knn)

    total = pair_score - fwd_score - bwd_score
    # The backward term comes out transposed relative to R.
    grad = pair_grad - fwd_grad - bwd_grad.T
    return -total / n_pairs, -grad / n_pairs
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def proj_spectral(R):
    """Return ``R`` with every singular value clipped to the interval [0, 1].

    The reduced SVD is used so that the factors can be recombined for
    rectangular matrices as well as square ones.

    Args:
        R: 2-D array.

    Returns:
        A new array of the same shape as ``R`` whose singular values are
        those of ``R`` capped at 1.
    """
    # full_matrices=False keeps U (m, k), s (k,), Vt (k, n) with k = min(m, n),
    # so U * s and Vt always have compatible shapes.
    U, s, Vt = np.linalg.svd(R, full_matrices=False)
    s = np.clip(s, 0, 1)
    # Scaling the columns of U by s is equivalent to U.dot(diag(s)).
    return np.dot(U * s, Vt)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
###### MAIN ######
|
| 73 |
+
|
| 74 |
+
# load word embeddings
words_tgt, x_tgt = load_vectors(params.tgt_emb, maxload=params.maxload, center=params.center)
words_src, x_src = load_vectors(params.src_emb, maxload=params.maxload, center=params.center)

# load validation bilingual lexicon
src2tgt, lexicon_size = load_lexicon(params.dico_test, words_src, words_tgt)

# word --> vector indices
idx_src = idx(words_src)
idx_tgt = idx(words_tgt)

# load train bilingual lexicon
pairs = load_pairs(params.dico_train, idx_src, idx_tgt)
if params.maxsup > 0 and params.maxsup < len(pairs):
    pairs = pairs[:params.maxsup]

# selecting training vector pairs
X_src, Y_tgt = select_vectors_from_pairs(x_src, x_tgt, pairs)

# adding negatives for RCSLS
Z_src = x_src[:params.maxneg, :]
Z_tgt = x_tgt[:params.maxneg, :]

# initialization:
R = procrustes(X_src, Y_tgt)
nnacc = compute_nn_accuracy(np.dot(x_src, R.T), x_tgt, src2tgt, lexicon_size=lexicon_size)
print("[init -- Procrustes] NN: %.4f"%(nnacc))
sys.stdout.flush()

# optimization
fold, Rold = 0, []
niter, lr = params.niter, params.lr

for it in range(0, niter + 1):
    # stop once the step size has been halved below a useful magnitude
    if lr < 1e-4:
        break

    if params.sgd:
        # Sampling without replacement cannot return more items than exist,
        # so cap the batch at the number of training pairs.
        indices = np.random.choice(
            X_src.shape[0],
            size=min(params.batchsize, X_src.shape[0]),
            replace=False)
        f, df = rcsls(X_src[indices, :], Y_tgt[indices, :], Z_src, Z_tgt, R, params.knn)
    else:
        f, df = rcsls(X_src, Y_tgt, Z_src, Z_tgt, R, params.knn)

    # f and df were evaluated at R *before* the update below
    if params.reg > 0:
        R *= (1 - lr * params.reg)
    R -= lr * df
    if params.model == "spectral":
        R = proj_spectral(R)

    print("[it=%d] f = %.4f" % (it, f))
    sys.stdout.flush()

    if f > fold and it > 0 and not params.sgd:
        lr /= 2
        # NOTE(review): R is modified in place above, so (assuming R is a
        # NumPy array) Rold can be the very same object as R and this
        # assignment then does not bring back earlier values. Left as is:
        # changing it would alter the optimisation trajectory.
        f, R = fold, Rold

    fold, Rold = f, R

    # periodic validation, plus once on the last scheduled iteration
    if (it > 0 and it % 10 == 0) or it == niter:
        nnacc = compute_nn_accuracy(np.dot(x_src, R.T), x_tgt, src2tgt, lexicon_size=lexicon_size)
        print("[it=%d] NN = %.4f - Coverage = %.4f" % (it, nnacc, len(src2tgt) / lexicon_size))

nnacc = compute_nn_accuracy(np.dot(x_src, R.T), x_tgt, src2tgt, lexicon_size=lexicon_size)
print("[final] NN = %.4f - Coverage = %.4f" % (nnacc, len(src2tgt) / lexicon_size))

if params.output != "":
    print("Saving all aligned vectors at %s" % params.output)
    # reload the source embeddings with maxload=-1 before applying the mapping
    words_full, x_full = load_vectors(params.src_emb, maxload=-1, center=params.center, verbose=False)
    x = np.dot(x_full, R.T)
    # rescale every mapped vector to unit length; the epsilon avoids dividing by zero
    x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
    save_vectors(params.output, x, words_full)
    save_matrix(params.output + "-mat", R)
|
hf_demo/fastText/alignment/eval.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
#
|
| 4 |
+
# Copyright (c) 2018-present, Facebook, Inc.
|
| 5 |
+
# All rights reserved.
|
| 6 |
+
#
|
| 7 |
+
# This source code is licensed under the license found in the
|
| 8 |
+
# LICENSE file in the root directory of this source tree.
|
| 9 |
+
|
| 10 |
+
import io
|
| 11 |
+
import numpy as np
|
| 12 |
+
import argparse
|
| 13 |
+
from utils import *
|
| 14 |
+
|
| 15 |
+
# Command-line interface of the evaluation script.  Options are declared in a
# table and registered in order so that the --help listing keeps its layout.
parser = argparse.ArgumentParser(description='Evaluation of word alignment')
for _flag, _options in (
    ("--src_emb", dict(type=str, default='', help="Load source embeddings")),
    ("--tgt_emb", dict(type=str, default='', help="Load target embeddings")),
    ('--center', dict(action='store_true', help='whether to center embeddings or not')),
    ("--src_mat", dict(type=str, default='', help="Load source alignment matrix. If none given, the aligment matrix is the identity.")),
    ("--tgt_mat", dict(type=str, default='', help="Load target alignment matrix. If none given, the aligment matrix is the identity.")),
    ("--dico_test", dict(type=str, default='', help="test dictionary")),
    ("--maxload", dict(type=int, default=200000)),
    ("--nomatch", dict(action='store_true', help="no exact match in lexicon")),
):
    parser.add_argument(_flag, **_options)
# Parsed once at import time; the rest of the script reads `params`.
params = parser.parse_args()
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
###### SPECIFIC FUNCTIONS ######
|
| 28 |
+
# function specific to evaluation
|
| 29 |
+
# the rest of the functions are in utils.py
|
| 30 |
+
|
| 31 |
+
def load_transform(fname, d1=300, d2=300):
    """Read a d1 x d2 alignment matrix from a whitespace-separated text file.

    Each non-empty line supplies one matrix row; only the first ``d2`` values
    of a line are used.  A leading two-token line (the ``"rows cols"`` header
    that ``save_matrix`` emits) is skipped when ``d2 > 2`` so that matrices
    written by the alignment scripts can be read back directly.

    Args:
        fname: path of the matrix file (decoded as UTF-8, bad bytes ignored).
        d1: number of rows of the returned matrix.
        d2: number of columns of the returned matrix.

    Returns:
        A ``(d1, d2)`` float array; rows missing from the file stay zero.

    Raises:
        IndexError: if the file holds more than ``d1`` data rows.
        ValueError: if a token cannot be parsed as a float.
    """
    R = np.zeros([d1, d2])
    # `with` guarantees the handle is released even if parsing fails.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        row = 0
        for lineno, line in enumerate(fin):
            tokens = line.split()
            if not tokens:
                # Blank line (e.g. trailing newline at end of file).
                continue
            if lineno == 0 and len(tokens) == 2 and d2 > 2:
                # Shape header, not matrix data.
                continue
            R[row, :] = np.array(tokens[0:d2], dtype=float)
            row += 1
    return R
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
###### MAIN ######
|
| 41 |
+
|
| 42 |
+
print("Evaluation of alignment on %s" % params.dico_test)
if params.nomatch:
    # NOTE(review): --nomatch only triggers this message; nothing below reads it.
    print("running without exact string matches")

words_tgt, x_tgt = load_vectors(params.tgt_emb, maxload=params.maxload, center=params.center)
words_src, x_src = load_vectors(params.src_emb, maxload=params.maxload, center=params.center)

# Optional linear maps.  The matrix shape is taken from the loaded embeddings
# rather than from load_transform's 300x300 default, so embeddings of any
# dimension can be evaluated.
if params.tgt_mat != "":
    dim_tgt = x_tgt.shape[1]
    R_tgt = load_transform(params.tgt_mat, dim_tgt, dim_tgt)
    x_tgt = np.dot(x_tgt, R_tgt)
if params.src_mat != "":
    dim_src = x_src.shape[1]
    R_src = load_transform(params.src_mat, dim_src, dim_src)
    x_src = np.dot(x_src, R_src)

src2tgt, lexicon_size = load_lexicon(params.dico_test, words_src, words_tgt)

# Nearest-neighbour and CSLS retrieval accuracy over the test dictionary.
nnacc = compute_nn_accuracy(x_src, x_tgt, src2tgt, lexicon_size=lexicon_size)
cslsproc = compute_csls_accuracy(x_src, x_tgt, src2tgt, lexicon_size=lexicon_size)
print("NN = %.4f - CSLS = %.4f - Coverage = %.4f" % (nnacc, cslsproc, len(src2tgt) / lexicon_size))
|
hf_demo/fastText/alignment/example.sh
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env sh
# Copyright (c) 2018-present, Facebook, Inc.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

# Usage: example.sh [SRC_LANG [TGT_LANG]]   (defaults: en es)
# Downloads dictionaries and wiki embeddings into data/, learns an alignment
# with align.py and evaluates the aligned vectors with eval.py.

set -e
s=${1:-en}
t=${2:-es}
echo "Example based on the ${s}->${t} alignment"

# mkdir -p succeeds on existing directories, so no -d guard is required.
mkdir -p data
mkdir -p res

# Training dictionary.
dico_train=data/${s}-${t}.0-5000.txt
if [ ! -f "${dico_train}" ]; then
  DICO=$(basename -- "${dico_train}")
  wget -c "https://dl.fbaipublicfiles.com/arrival/dictionaries/${DICO}" -P data/
fi

# Test dictionary.
dico_test=data/${s}-${t}.5000-6500.txt
if [ ! -f "${dico_test}" ]; then
  DICO=$(basename -- "${dico_test}")
  wget -c "https://dl.fbaipublicfiles.com/arrival/dictionaries/${DICO}" -P data/
fi

# Source embeddings.
src_emb=data/wiki.${s}.vec
if [ ! -f "${src_emb}" ]; then
  EMB=$(basename -- "${src_emb}")
  wget -c "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/${EMB}" -P data/
fi

# Target embeddings.
tgt_emb=data/wiki.${t}.vec
if [ ! -f "${tgt_emb}" ]; then
  EMB=$(basename -- "${tgt_emb}")
  wget -c "https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/${EMB}" -P data/
fi

output=res/wiki.${s}-${t}.vec

python3 align.py --src_emb "${src_emb}" --tgt_emb "${tgt_emb}" \
  --dico_train "${dico_train}" --dico_test "${dico_test}" --output "${output}" \
  --lr 25 --niter 10
python3 eval.py --src_emb "${output}" --tgt_emb "${tgt_emb}" \
  --dico_test "${dico_test}"
|
hf_demo/fastText/alignment/unsup_align.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# Copyright (c) 2018-present, Facebook, Inc.
|
| 3 |
+
# All rights reserved.
|
| 4 |
+
#
|
| 5 |
+
# This source code is licensed under the MIT license found in the
|
| 6 |
+
# LICENSE file in the root directory of this source tree.
|
| 7 |
+
|
| 8 |
+
import codecs, sys, time, math, argparse, ot
|
| 9 |
+
import numpy as np
|
| 10 |
+
from utils import *
|
| 11 |
+
|
| 12 |
+
# Command-line interface.  The three input paths have no usable default, so
# they are marked required: a missing one is reported by argparse up front
# instead of surfacing later as a failure while opening ``None``.
parser = argparse.ArgumentParser(description='Wasserstein Procrustes for Embedding Alignment')
parser.add_argument('--model_src', type=str, required=True, help='Path to source word embeddings')
parser.add_argument('--model_tgt', type=str, required=True, help='Path to target word embeddings')
parser.add_argument('--lexicon', type=str, required=True, help='Path to the evaluation lexicon')
parser.add_argument('--output_src', default='', type=str, help='Path to save the aligned source embeddings')
parser.add_argument('--output_tgt', default='', type=str, help='Path to save the aligned target embeddings')
parser.add_argument('--seed', default=1111, type=int, help='Random number generator seed')
parser.add_argument('--nepoch', default=5, type=int, help='Number of epochs')
parser.add_argument('--niter', default=5000, type=int, help='Initial number of iterations')
parser.add_argument('--bsz', default=500, type=int, help='Initial batch size')
parser.add_argument('--lr', default=500., type=float, help='Learning rate')
parser.add_argument('--nmax', default=20000, type=int, help='Vocabulary size for learning the alignment')
parser.add_argument('--reg', default=0.05, type=float, help='Regularization parameter for sinkhorn')
args = parser.parse_args()
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def objective(X, Y, R, n=5000):
    """Monitoring objective computed on the first ``n`` rows of X and Y.

    Builds the cost ``-(X R) Y^T``, solves an entropic transport problem with
    ``ot.sinkhorn`` (regularisation 0.025) and returns
    ``1000 * ||X R - P Y|| / n``.

    Args:
        X, Y: 2-D embedding arrays.
        R: mapping applied to X on the right.
        n: maximum number of leading rows to use; clamped to the rows
            actually available so that the marginals match the cost matrix.
    """
    n = min(n, X.shape[0], Y.shape[0])
    Xn, Yn = X[:n], Y[:n]
    XR = np.dot(Xn, R)  # reused for both the cost and the residual
    C = -np.dot(XR, Yn.T)
    P = ot.sinkhorn(np.ones(n), np.ones(n), C, 0.025, stopThr=1e-3)
    return 1000 * np.linalg.norm(XR - np.dot(P, Yn)) / n
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def sqrt_eig(x):
    """Return ``U * sqrt(S) * V^T`` where ``U, S, V^T`` is the thin SVD of ``x``."""
    left, singular_values, right_t = np.linalg.svd(x, full_matrices=False)
    root_scale = np.diag(np.sqrt(singular_values))
    scaled_right = np.dot(root_scale, right_t)
    return np.dot(left, scaled_right)
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def align(X, Y, R, lr=10., bsz=200, nepoch=5, niter=1000,
          nmax=10000, reg=0.05, verbose=True):
    """Stochastic alignment of X to Y by alternating Sinkhorn and SVD steps.

    For every mini-batch: draw ``bsz`` row indices below ``nmax`` from each
    side, solve an entropic transport problem on the batch, take a gradient
    step on ``R`` and project ``R`` back with an SVD (``U V^T``).  After each
    epoch the batch size doubles and the iteration count is divided by four.

    Note: the first gradient step updates the passed-in ``R`` array in place.

    Returns:
        The final mapping ``R``.
    """
    for epoch in range(1, nepoch + 1):
        for _ in range(niter):
            # Independent random batches from the first nmax rows of each side.
            src_batch = X[np.random.permutation(nmax)[:bsz], :]
            tgt_batch = Y[np.random.permutation(nmax)[:bsz], :]
            # Transport plan for the current mapping.
            cost = -np.dot(np.dot(src_batch, R), tgt_batch.T)
            plan = ot.sinkhorn(np.ones(bsz), np.ones(bsz), cost, reg, stopThr=1e-3)
            # Gradient step.
            grad = - np.dot(src_batch.T, np.dot(plan, tgt_batch))
            R -= lr / bsz * grad
            # Projection: keep only the singular vectors.
            left, _, right_t = np.linalg.svd(R)
            R = np.dot(left, right_t)
        bsz *= 2
        niter //= 4
        if verbose:
            print("epoch: %d obj: %.3f" % (epoch, objective(X, Y, R)))
    return R
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
def convex_init(X, Y, niter=100, reg=0.05, apply_sqrt=False):
    """Initial mapping from a Frank-Wolfe style loop over coupling matrices.

    Gram matrices of X and Y are compared through a coupling ``P`` that is
    refined for ``niter`` steps: each step calls ``ot.sinkhorn`` on the
    current gradient and mixes the result into ``P`` with weight
    ``2 / (2 + it)``.  The final residual norm is printed and the transpose
    of ``procrustes(P X, Y)`` is returned.

    Args:
        X, Y: 2-D arrays with the same number of rows.
        niter: number of refinement steps.
        reg: Sinkhorn regularisation.
        apply_sqrt: when true, X and Y are first replaced by ``sqrt_eig`` of
            themselves.
    """
    n, _ = X.shape
    if apply_sqrt:
        X, Y = sqrt_eig(X), sqrt_eig(Y)
    gram_x = np.dot(X, X.T)
    gram_y = np.dot(Y, Y.T)
    # Bring both Gram matrices to the same norm.
    gram_y *= np.linalg.norm(gram_x) / np.linalg.norm(gram_y)
    gram_x_sq = np.dot(gram_x, gram_x)
    gram_y_sq = np.dot(gram_y, gram_y)
    P = np.ones([n, n]) / float(n)
    for it in range(1, niter + 1):
        G = np.dot(P, gram_x_sq) + np.dot(gram_y_sq, P) - 2 * np.dot(gram_y, np.dot(P, gram_x))
        q = ot.sinkhorn(np.ones(n), np.ones(n), G, reg, stopThr=1e-3)
        step = 2.0 / float(2.0 + it)
        P = step * q + (1.0 - step) * P
    residual = np.linalg.norm(np.dot(P, gram_x) - np.dot(gram_y, P))
    print(residual)
    return procrustes(np.dot(P, X), Y).T
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
print("\n*** Wasserstein Procrustes ***\n")

np.random.seed(args.seed)

maxload = 200000
w_src, x_src = load_vectors(args.model_src, maxload, norm=True, center=True)
w_tgt, x_tgt = load_vectors(args.model_tgt, maxload, norm=True, center=True)
src2trg, _ = load_lexicon(args.lexicon, w_src, w_tgt)

# Never sample or slice beyond the vectors that were actually loaded:
# align() indexes rows below nmax, and convex_init() needs equally sized
# inputs on both sides.
n_avail = min(x_src.shape[0], x_tgt.shape[0])
ninit = min(2500, n_avail)
nmax = min(args.nmax, n_avail)

print("\nComputing initial mapping with convex relaxation...")
t0 = time.time()
R0 = convex_init(x_src[:ninit], x_tgt[:ninit], reg=args.reg, apply_sqrt=True)
print("Done [%03d sec]" % math.floor(time.time() - t0))

print("\nComputing mapping with Wasserstein Procrustes...")
t0 = time.time()
# align() updates its R argument in place on the first step, hence the copy.
R = align(x_src, x_tgt, R0.copy(), bsz=args.bsz, lr=args.lr, niter=args.niter,
          nepoch=args.nepoch, reg=args.reg, nmax=nmax)
print("Done [%03d sec]" % math.floor(time.time() - t0))

acc = compute_nn_accuracy(x_src, np.dot(x_tgt, R.T), src2trg)
print("\nPrecision@1: %.3f\n" % acc)

# The 1e-8 term mirrors the normalisation used in utils.py and avoids NaN
# rows for all-zero vectors.
if args.output_src != '':
    x_src = x_src / (np.linalg.norm(x_src, 2, 1).reshape([-1, 1]) + 1e-8)
    save_vectors(args.output_src, x_src, w_src)
if args.output_tgt != '':
    x_tgt = x_tgt / (np.linalg.norm(x_tgt, 2, 1).reshape([-1, 1]) + 1e-8)
    save_vectors(args.output_tgt, np.dot(x_tgt, R.T), w_tgt)
|
hf_demo/fastText/alignment/unsup_multialign.py
ADDED
|
@@ -0,0 +1,198 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# -*- coding: utf-8 -*-
|
| 3 |
+
#
|
| 4 |
+
# Copyright (c) 2019-present, Facebook, Inc.
|
| 5 |
+
# All rights reserved.
|
| 6 |
+
#
|
| 7 |
+
# This source code is licensed under the license found in the
|
| 8 |
+
# LICENSE file in the root directory of this source tree.
|
| 9 |
+
|
| 10 |
+
import io, os, ot, argparse, random
|
| 11 |
+
import numpy as np
|
| 12 |
+
from utils import *
|
| 13 |
+
|
| 14 |
+
# Command-line interface.  Options are declared in a table and registered in
# order so that the --help listing keeps its layout.
parser = argparse.ArgumentParser(description=' ')
for _flag, _options in (
    ('--embdir', dict(default='data/', type=str)),
    ('--outdir', dict(default='output/', type=str)),
    ('--lglist', dict(default='en-fr-es-it-pt-de-pl-ru-da-nl-cs', type=str,
                      help='list of languages. The first element is the pivot. Example: en-fr-es to align English, French and Spanish with English as the pivot.')),
    ('--maxload', dict(default=20000, type=int, help='Max number of loaded vectors')),
    ('--uniform', dict(action='store_true', help='switch to uniform probability of picking language pairs')),
    # optimization parameters for the square loss
    ('--epoch', dict(default=2, type=int, help='nb of epochs for square loss')),
    ('--niter', dict(default=500, type=int, help='max number of iteration per epoch for square loss')),
    ('--lr', dict(default=0.1, type=float, help='learning rate for square loss')),
    ('--bsz', dict(default=500, type=int, help='batch size for square loss')),
    # optimization parameters for the RCSLS loss
    ('--altepoch', dict(default=100, type=int, help='nb of epochs for RCSLS loss')),
    ('--altlr', dict(default=25, type=float, help='learning rate for RCSLS loss')),
    ("--altbsz", dict(type=int, default=1000, help="batch size for RCSLS")),
):
    parser.add_argument(_flag, **_options)

args = parser.parse_args()
|
| 36 |
+
|
| 37 |
+
###### SPECIFIC FUNCTIONS ######
|
| 38 |
+
|
| 39 |
+
def getknn(sc, x, y, k=10):
    """Average top-k score term and its gradient factor.

    For every row of the score matrix ``sc`` the ``k`` largest entries are
    selected.  Returns the sum of those scores divided by ``k`` and
    ``(sum of the selected rows of y)^T x`` divided by ``k``.
    """
    topk_idx = np.argpartition(sc, -k, axis=1)[:, -k:]
    n_rows, n_nbrs = topk_idx.shape
    # Gather the selected rows of y as an (n_rows, k, dim) tensor.
    neighbours = y[topk_idx.ravel(), :].reshape(n_rows, n_nbrs, y.shape[1])
    row_ids = np.arange(sc.shape[0])[:, None]
    score_total = np.sum(sc[row_ids, topk_idx])
    grad = np.dot(neighbours.sum(1).T, x)
    return score_total / k, grad / k
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def rcsls(Xi, Xj, Zi, Zj, R, knn=10):
    """Loss value and gradient built from a pairwise term and two k-NN terms.

    The pairwise term is ``2 * sum((Xi R^T) * Xj)``; the two ``getknn`` terms
    (scores of mapped ``Xi`` against ``Zj`` and of ``Xj`` against mapped
    ``Zi``) are subtracted from it.  Both outputs are negated and divided by
    the number of rows of ``Xi``.

    Returns:
        ``(loss, gradient)``.
    """
    n_pairs = Xi.shape[0]
    mapped = np.dot(Xi, R.T)
    pair_term = 2 * np.sum(mapped * Xj)
    pair_grad = 2 * np.dot(Xj.T, Xi)
    fwd_term, fwd_grad = getknn(np.dot(mapped, Zj.T), Xi, Zj, knn)
    bwd_scores = np.dot(np.dot(Zi, R.T), Xj.T).T
    bwd_term, bwd_grad = getknn(bwd_scores, Xj, Zi, knn)
    total = pair_term - fwd_term - bwd_term
    total_grad = pair_grad - fwd_grad - bwd_grad.T
    return -total / n_pairs, -total_grad.T / n_pairs
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def GWmatrix(emb0):
    """Pairwise matrix ``0.5*|e_i| + 0.5*|e_j| - <e_i, e_j>`` over the rows of ``emb0``."""
    count = np.shape(emb0)[0]
    half_norms = .5 * np.linalg.norm(emb0, axis=1).reshape(1, count)
    # Column + row broadcast gives half_norms[i] + half_norms[j].
    cost = half_norms.transpose() + half_norms
    cost -= np.dot(emb0, emb0.T)
    return cost
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def gromov_wasserstein(x_src, x_tgt, C2):
    """Initial mapping from a Gromov-Wasserstein coupling.

    Builds ``GWmatrix(x_src)``, couples it with the precomputed ``C2`` through
    ``ot.gromov_wasserstein`` (square loss, all-ones marginals) and returns
    ``procrustes(coupling . x_tgt, x_src)``.
    """
    n_src = x_src.shape[0]
    src_cost = GWmatrix(x_src)
    coupling = ot.gromov_wasserstein(src_cost, C2, np.ones(n_src), np.ones(n_src),
                                     'square_loss', epsilon=0.55, max_iter=100, tol=1e-4)
    transported = np.dot(coupling, x_tgt)
    return procrustes(transported, x_src)
|
| 72 |
+
|
| 73 |
+
|
| 74 |
+
def _proj_ortho(M):
    """Project M onto the orthogonal matrices: U V^T from the SVD of M."""
    U, _, VT = np.linalg.svd(M)
    return np.dot(U, VT)


def align(EMB, TRANS, lglist, args):
    """Jointly refine the per-language mappings in ``TRANS``.

    Two phases are run over randomly sampled language pairs:
      1. a Sinkhorn-based phase ("L2 loss") controlled by ``args.lr``,
         ``args.bsz``, ``args.epoch`` and ``args.niter``;
      2. an RCSLS phase controlled by ``args.altlr``, ``args.altbsz`` and
         ``args.altepoch``.
    ``TRANS[0]`` (the pivot) is never updated; every other mapping is
    re-projected with an SVD after each step.

    Args:
        EMB: mapping from language index to its embedding matrix.
        TRANS: mapping from language index to its current mapping matrix;
            updated in place.
        lglist: list of language codes; index 0 is the pivot.
        args: parsed command-line options.

    Returns:
        ``TRANS``.
    """
    nmax, l = args.maxload, len(lglist)
    # create a list of language pairs to sample from
    # (default == higher probability to pick a language pair containing the pivot)
    # if --uniform: uniform probability of picking a language pair
    samples = []
    for i in range(l):
        for j in range(l):
            if j == i:
                continue
            if j > 0 and not args.uniform:
                samples.append((0, j))
            if i > 0 and not args.uniform:
                samples.append((i, 0))
            samples.append((i, j))

    # optimization of the l2 loss
    print('start optimizing L2 loss')
    lr0, bsz, nepoch, niter = args.lr, args.bsz, args.epoch, args.niter
    for epoch in range(nepoch):
        print("start epoch %d / %d"%(epoch+1, nepoch))
        ones = np.ones(bsz)
        f, fold, nb, lr = 0.0, 0.0, 0.0, lr0
        for it in range(niter):
            # halve the step when the loss went up; stop once it is tiny
            if it > 1 and f > fold + 1e-3:
                lr /= 2
            if lr < .05:
                break
            fold = f
            f, nb = 0.0, 0.0
            for k in range(100 * (l-1)):
                (i, j) = random.choice(samples)
                embi = EMB[i][np.random.permutation(nmax)[:bsz], :]
                embj = EMB[j][np.random.permutation(nmax)[:bsz], :]
                perm = ot.sinkhorn(ones, ones, np.linalg.multi_dot([embi, -TRANS[i], TRANS[j].T, embj.T]), reg = 0.025, stopThr = 1e-3)
                grad = np.linalg.multi_dot([embi.T, perm, embj])
                f -= np.trace(np.linalg.multi_dot([TRANS[i].T, grad, TRANS[j]])) / embi.shape[0]
                nb += 1
                if i > 0:
                    TRANS[i] = _proj_ortho(TRANS[i] + lr * np.dot(grad, TRANS[j]))
                if j > 0:
                    TRANS[j] = _proj_ortho(TRANS[j] + lr * np.dot(grad.transpose(), TRANS[i]))
            # max(nb, 1) matches the end-of-epoch report and avoids 0/0
            print("iter %d / %d - epoch %d - loss: %.5f lr: %.4f" % (it, niter, epoch+1, f / max(nb,1), lr))
        print("end of epoch %d - loss: %.5f - lr: %.4f" % (epoch+1, f / max(nb,1), lr))
        niter, bsz = max(int(niter/2),2), min(1000, bsz * 2)
    #end for epoch in range(nepoch):

    # optimization of the RCSLS loss
    print('start optimizing RCSLS loss')
    f, fold, nb, lr = 0.0, 0.0, 0.0, args.altlr
    for epoch in range(args.altepoch):
        if epoch > 1 and f-fold > -1e-4 * abs(fold):
            lr /= 2
        if lr < 1e-1:
            break
        fold = f
        f, nb = 0.0, 0.0
        for k in range(round(nmax / args.altbsz) * 10 * (l-1)):
            (i, j) = random.choice(samples)
            sgdidx = np.random.choice(nmax, size=args.altbsz, replace=False)
            embi = EMB[i][sgdidx, :]
            embj = EMB[j][:nmax, :]
            # crude alignment approximation:
            T = np.dot(TRANS[i], TRANS[j].T)
            scores = np.linalg.multi_dot([embi, T, embj.T])
            perm = np.zeros_like(scores)
            perm[np.arange(len(scores)), scores.argmax(1)] = 1
            embj = np.dot(perm, embj)
            # normalization over a subset of embeddings for speed up
            fi, grad = rcsls(embi, embj, embi, embj, T.T)
            f += fi
            nb += 1
            if i > 0:
                TRANS[i] = _proj_ortho(TRANS[i] - lr * np.dot(grad, TRANS[j]))
            if j > 0:
                TRANS[j] = _proj_ortho(TRANS[j] - lr * np.dot(grad.transpose(), TRANS[i]))
        print("epoch %d - loss: %.5f - lr: %.4f" % (epoch+1, f / max(nb,1), lr))
    #end for epoch in range(args.altepoch):
    return TRANS
|
| 153 |
+
|
| 154 |
+
def convex_init(X, Y, niter=100, reg=0.05, apply_sqrt=False):
    """Initial mapping from a Frank-Wolfe style loop over coupling matrices.

    Each of the ``niter`` steps calls ``ot.sinkhorn`` on the current gradient
    and mixes the result into the coupling ``P`` with weight ``2 / (2 + it)``.
    Returns the transpose of ``procrustes(P X, Y)``.

    ``apply_sqrt`` is accepted but not read by this implementation.
    """
    n, _ = X.shape
    gram_x = np.dot(X, X.T)
    gram_y = np.dot(Y, Y.T)
    # Bring both Gram matrices to the same norm.
    gram_y *= np.linalg.norm(gram_x) / np.linalg.norm(gram_y)
    gram_x_sq = np.dot(gram_x, gram_x)
    gram_y_sq = np.dot(gram_y, gram_y)
    P = np.ones([n, n]) / float(n)
    for it in range(1, niter + 1):
        G = np.dot(P, gram_x_sq) + np.dot(gram_y_sq, P) - 2 * np.dot(gram_y, np.dot(P, gram_x))
        q = ot.sinkhorn(np.ones(n), np.ones(n), G, reg, stopThr=1e-3)
        step = 2.0 / float(2.0 + it)
        P = step * q + (1.0 - step) * P
    return procrustes(np.dot(P, X), Y).T
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
###### MAIN ######
|
| 169 |
+
|
| 170 |
+
lglist = args.lglist.split('-')
l = len(lglist)

# embs: one embedding matrix per language, keyed by its position in lglist
EMB = {}
for i in range(l):
    fn = args.embdir + '/wiki.' + lglist[i] + '.vec'
    _, vecs = load_vectors(fn, maxload=args.maxload)
    EMB[i] = vecs

#init
print("Computing initial bilingual apping with Gromov-Wasserstein...")
TRANS={}
maxinit = 2000
emb0 = EMB[0][:maxinit,:]
C0 = GWmatrix(emb0)
# The pivot keeps the identity mapping; its size follows the loaded
# embeddings instead of assuming 300 dimensions.
TRANS[0] = np.eye(EMB[0].shape[1])
for i in range(1, l):
    print("init "+lglist[i])
    embi = EMB[i][:maxinit,:]
    TRANS[i] = gromov_wasserstein(embi, emb0, C0)

# align
align(EMB, TRANS, lglist, args)

print('saving matrices in ' + args.outdir)
# Create the output directory first so the results of the optimisation are
# not lost to a missing-directory error.
if args.outdir:
    os.makedirs(args.outdir, exist_ok=True)
languages=''.join(lglist)
for i in range(l):
    save_matrix(args.outdir + '/W-' + languages + '-' + lglist[i], TRANS[i])
|
hf_demo/fastText/alignment/utils.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
# Copyright (c) 2018-present, Facebook, Inc.
|
| 3 |
+
# All rights reserved.
|
| 4 |
+
#
|
| 5 |
+
# This source code is licensed under the license found in the
|
| 6 |
+
# LICENSE file in the root directory of this source tree.
|
| 7 |
+
|
| 8 |
+
import io
|
| 9 |
+
import numpy as np
|
| 10 |
+
import collections
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def load_vectors(fname, maxload=200000, norm=True, center=False, verbose=True):
    """Load word vectors from a text file in ``.vec`` format.

    The first line holds ``"<count> <dim>"``; every following line holds a
    word followed by its space-separated components.

    Args:
        fname: path of the vector file (decoded as UTF-8, bad bytes ignored).
        maxload: maximum number of vectors to read; values <= 0 read all.
        norm: divide every row by its L2 norm (plus 1e-8).
        center: subtract the mean row, then renormalise the rows.
        verbose: print progress messages.

    Returns:
        ``(words, x)``: the list of words and an array with one row per word.
    """
    if verbose:
        print("Loading vectors from %s" % fname)
    # `with` releases the handle even when a line fails to parse.
    with io.open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        n, d = map(int, fin.readline().split())
        if maxload > 0:
            n = min(n, maxload)
        x = np.zeros([n, d])
        words = []
        for i, line in enumerate(fin):
            if i >= n:
                break
            tokens = line.rstrip().split(' ')
            words.append(tokens[0])
            x[i, :] = np.array(tokens[1:], dtype=float)
    if len(words) < n:
        # The header promised more vectors than the file contains: drop the
        # unused zero rows so they have no word-less entries and do not skew
        # the centering mean.
        x = x[:len(words)]
    if norm:
        x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
    if center:
        x -= x.mean(axis=0)[np.newaxis, :]
        x /= np.linalg.norm(x, axis=1)[:, np.newaxis] + 1e-8
    if verbose:
        print("%d word vectors loaded" % (len(words)))
    return words, x
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def idx(words):
    """Map each word to the position of its first occurrence in ``words``."""
    first_seen = {}
    for position, word in enumerate(words):
        # setdefault leaves an existing entry alone, so duplicates keep the
        # earliest index.
        first_seen.setdefault(word, position)
    return first_seen
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def save_vectors(fname, x, words):
    """Write word vectors to ``fname`` in text ``.vec`` format.

    The first line is ``"<rows> <cols>"``; each following line is a word and
    its components formatted with four decimals.

    Args:
        fname: output path (written as UTF-8).
        x: 2-D array with one row per word.
        words: sequence of words, indexed like the rows of ``x``.
    """
    n, d = x.shape
    # `with` closes the file even if formatting a row raises.
    with io.open(fname, 'w', encoding='utf-8') as fout:
        fout.write(u"%d %d\n" % (n, d))
        for i in range(n):
            fout.write(words[i] + " " + " ".join(map(lambda a: "%.4f" % a, x[i, :])) + "\n")
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def save_matrix(fname, x):
    """Write a 2-D array to ``fname`` as text.

    The first line is ``"<rows> <cols>"``; each following line is one row
    with components formatted with four decimals.

    Args:
        fname: output path (written as UTF-8).
        x: 2-D array.
    """
    n, d = x.shape
    # `with` closes the file even if formatting a row raises.
    with io.open(fname, 'w', encoding='utf-8') as fout:
        fout.write(u"%d %d\n" % (n, d))
        for i in range(n):
            fout.write(" ".join(map(lambda a: "%.4f" % a, x[i, :])) + "\n")
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def procrustes(X_src, Y_tgt):
    """Return ``U V`` where ``U, s, V`` is the SVD of ``Y_tgt^T X_src``."""
    cross = np.dot(Y_tgt.T, X_src)
    left, _, right = np.linalg.svd(cross)
    return np.dot(left, right)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def select_vectors_from_pairs(x_src, y_tgt, pairs):
    """Gather the rows of ``x_src`` and ``y_tgt`` addressed by ``pairs``.

    Args:
        x_src, y_tgt: 2-D arrays.
        pairs: sequence of ``(source_row, target_row)`` index pairs.

    Returns:
        ``(x, y)``: two new float arrays with ``len(pairs)`` rows and as many
        columns as ``x_src``; row ``k`` holds the vectors of the k-th pair.
    """
    count = len(pairs)
    dim = x_src.shape[1]
    x = np.zeros([count, dim])
    y = np.zeros([count, dim])
    for row, (src_row, tgt_row) in enumerate(pairs):
        x[row, :] = x_src[src_row, :]
        y[row, :] = y_tgt[tgt_row, :]
    return x, y
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
def load_lexicon(filename, words_src, words_tgt, verbose=True):
    """Load a bilingual dictionary restricted to the known vocabularies.

    Every non-empty line of ``filename`` must hold a source word and a target
    word separated by whitespace.

    Args:
        filename: dictionary path (read as UTF-8).
        words_src, words_tgt: word lists; a word's index is the position of
            its first occurrence.
        verbose: print the share of distinct source words that were kept.

    Returns:
        ``(lexicon, size)``: ``lexicon`` maps a source index to the set of
        target indices it translates to (only pairs whose two words are both
        known); ``size`` is the number of distinct source words in the file,
        as a float.

    Raises:
        ValueError: if a non-empty line does not hold exactly two tokens.
    """
    lexicon = collections.defaultdict(set)
    idx_src, idx_tgt = idx(words_src), idx(words_tgt)
    vocab = set()
    # `with` releases the handle even when a malformed line raises.
    with io.open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            word_src, word_tgt = line.split()
            if word_src in idx_src and word_tgt in idx_tgt:
                lexicon[idx_src[word_src]].add(idx_tgt[word_tgt])
            vocab.add(word_src)
    if verbose:
        # An empty dictionary has zero coverage rather than raising 0/0.
        coverage = len(lexicon) / float(len(vocab)) if vocab else 0.0
        print("Coverage of source vocab: %.4f" % (coverage))
    return lexicon, float(len(vocab))
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def load_pairs(filename, idx_src, idx_tgt, verbose=True):
    """Load training word pairs as index pairs.

    Every non-empty line of ``filename`` must hold two words separated by a
    single space.

    Args:
        filename: dictionary path (read as UTF-8).
        idx_src, idx_tgt: mappings from word to index.
        verbose: print how many pairs were found and the coverage.

    Returns:
        List of ``(source_index, target_index)`` for the lines whose two
        words are both known, in file order.

    Raises:
        ValueError: if a non-empty line does not split into exactly two
            tokens.
    """
    pairs = []
    tot = 0
    # `with` releases the handle even when a malformed line raises.
    with io.open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            stripped = line.rstrip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            a, b = stripped.split(' ')
            tot += 1
            if a in idx_src and b in idx_tgt:
                pairs.append((idx_src[a], idx_tgt[b]))
    if verbose:
        # An empty file has zero coverage rather than raising 0/0.
        coverage = (1.0 * len(pairs)) / tot if tot else 0.0
        print("Found pairs for training: %d - Total pairs in file: %d - Coverage of pairs: %.4f" % (len(pairs), tot, coverage))
    return pairs
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def compute_nn_accuracy(x_src, x_tgt, lexicon, bsz=100, lexicon_size=-1):
    """Nearest-neighbour retrieval accuracy over a lexicon.

    Both embedding arrays are L2-normalised in place.  For every source index
    in ``lexicon`` the target row with the highest dot product is retrieved; a
    hit is counted when that row is one of the source word's translations.

    Args:
        x_src, x_tgt: 2-D float arrays (modified in place).
        lexicon: mapping from source index to a set of target indices.
        bsz: number of source words scored per batch.
        lexicon_size: denominator of the accuracy; negative means
            ``len(lexicon)``.

    Returns:
        Number of hits divided by ``lexicon_size``.
    """
    if lexicon_size < 0:
        lexicon_size = len(lexicon)
    queries = list(lexicon.keys())
    x_src /= np.linalg.norm(x_src, axis=1)[:, np.newaxis] + 1e-8
    x_tgt /= np.linalg.norm(x_tgt, axis=1)[:, np.newaxis] + 1e-8
    hits = 0.0
    for start in range(0, len(queries), bsz):
        batch = queries[start:start + bsz]
        # Column c holds the scores of all targets for the c-th query.
        best = np.dot(x_tgt, x_src[batch].T).argmax(axis=0)
        for src_id, tgt_id in zip(batch, best):
            if tgt_id in lexicon[src_id]:
                hits += 1.0
    return hits / lexicon_size
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
def compute_csls_accuracy(x_src, x_tgt, lexicon, lexicon_size=-1, k=10, bsz=1024):
    """CSLS retrieval accuracy over a lexicon.

    Both embedding arrays are L2-normalised in place.  The score of a
    (source, target) pair is twice their dot product minus the mean of the
    target's ``k`` largest dot products with all source rows.  A hit is
    counted when the best-scoring target is one of the source word's
    translations.

    Args:
        x_src, x_tgt: 2-D float arrays (modified in place).
        lexicon: mapping from source index to a set of target indices.
        lexicon_size: denominator of the accuracy; negative means
            ``len(lexicon)``.
        k: number of neighbours used for the per-target penalty.
        bsz: number of target rows processed per batch.

    Returns:
        Number of hits divided by ``lexicon_size``.
    """
    if lexicon_size < 0:
        lexicon_size = len(lexicon)
    queries = list(lexicon.keys())

    x_src /= np.linalg.norm(x_src, axis=1)[:, np.newaxis] + 1e-8
    x_tgt /= np.linalg.norm(x_tgt, axis=1)[:, np.newaxis] + 1e-8

    similarities = 2 * np.dot(x_src[list(queries)], x_tgt.T)

    # Per-target penalty: mean of its k largest similarities to the source rows.
    n_tgt = x_tgt.shape[0]
    penalty = np.zeros(n_tgt)
    for start in range(0, n_tgt, bsz):
        stop = min(start + bsz, n_tgt)
        block = np.dot(x_tgt[start:stop, :], x_src.T)
        topk = np.partition(block, -k, axis=1)[:, -k:]
        penalty[start:stop] = np.mean(topk, axis=1)
    similarities -= penalty[np.newaxis, :]

    predictions = np.argmax(similarities, axis=1).tolist()
    correct = 0.0
    for src_id, tgt_id in zip(queries, predictions):
        if tgt_id in lexicon[src_id]:
            correct += 1.0
    return correct / lexicon_size
|
hf_demo/fastText/classification-example.sh
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
#
|
| 3 |
+
# Copyright (c) 2016-present, Facebook, Inc.
|
| 4 |
+
# All rights reserved.
|
| 5 |
+
#
|
| 6 |
+
# This source code is licensed under the MIT license found in the
|
| 7 |
+
# LICENSE file in the root directory of this source tree.
|
| 8 |
+
#
|
| 9 |
+
|
| 10 |
+
# Print the lines of the given files (or stdin) in random order, using Perl's
# List::Util::shuffle.
myshuf() {
  perl -MList::Util=shuffle \
    -e 'print shuffle(<>);' \
    "$@"
}
|
| 13 |
+
|
| 14 |
+
# Filter stdin -> stdout: lowercase, prefix each line with __label__, pad or
# strip punctuation, squeeze repeated spaces, then shuffle the lines.
normalize_text() {
  tr '[:upper:]' '[:lower:]' \
    | sed -e 's/^/__label__/g' \
    | sed -e "s/'/ ' /g" \
          -e 's/"//g' \
          -e 's/\./ \. /g' \
          -e 's/<br \/>/ /g' \
          -e 's/,/ , /g' \
          -e 's/(/ ( /g' \
          -e 's/)/ ) /g' \
          -e 's/\!/ \! /g' \
          -e 's/\?/ \? /g' \
          -e 's/\;/ /g' \
          -e 's/\:/ /g' \
    | tr -s " " \
    | myshuf
}
|
| 20 |
+
|
| 21 |
+
RESULTDIR=result
DATADIR=data

mkdir -p "${RESULTDIR}"
mkdir -p "${DATADIR}"

# Fetch and preprocess DBpedia only when the training file is missing.
if [ ! -f "${DATADIR}/dbpedia.train" ]; then
  wget -c "https://drive.google.com/uc?export=download&id=0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k" -O "${DATADIR}/dbpedia_csv.tar.gz"
  tar -xzvf "${DATADIR}/dbpedia_csv.tar.gz" -C "${DATADIR}"
  normalize_text < "${DATADIR}/dbpedia_csv/train.csv" > "${DATADIR}/dbpedia.train"
  normalize_text < "${DATADIR}/dbpedia_csv/test.csv" > "${DATADIR}/dbpedia.test"
fi

# Build the fasttext binary.
make

# Train, evaluate, and write predictions for the test set.
./fasttext supervised \
  -input "${DATADIR}/dbpedia.train" \
  -output "${RESULTDIR}/dbpedia" \
  -dim 10 -lr 0.1 -wordNgrams 2 -minCount 1 -bucket 10000000 -epoch 5 -thread 4

./fasttext test "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test"

./fasttext predict "${RESULTDIR}/dbpedia.bin" "${DATADIR}/dbpedia.test" \
  > "${RESULTDIR}/dbpedia.test.predict"
|
hf_demo/fastText/classification-results.sh
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env bash
|
| 2 |
+
#
|
| 3 |
+
# Copyright (c) 2016-present, Facebook, Inc.
|
| 4 |
+
# All rights reserved.
|
| 5 |
+
#
|
| 6 |
+
# This source code is licensed under the MIT license found in the
|
| 7 |
+
# LICENSE file in the root directory of this source tree.
|
| 8 |
+
#
|
| 9 |
+
|
| 10 |
+
# This script produces the results from Table 1 in the following paper:
|
| 11 |
+
# Bag of Tricks for Efficient Text Classification, arXiv 1607.01759, 2016
|
| 12 |
+
|
| 13 |
+
# Print the lines of the given files (or stdin) in random order, using Perl's
# List::Util::shuffle.
myshuf() {
  perl -MList::Util=shuffle \
    -e 'print shuffle(<>);' \
    "$@"
}
|
| 16 |
+
|
| 17 |
+
# Filter stdin -> stdout: lowercase, prefix each line with __label__, pad or
# strip punctuation, squeeze repeated spaces, then shuffle the lines.
normalize_text() {
  tr '[:upper:]' '[:lower:]' \
    | sed -e 's/^/__label__/g' \
    | sed -e "s/'/ ' /g" \
          -e 's/"//g' \
          -e 's/\./ \. /g' \
          -e 's/<br \/>/ /g' \
          -e 's/,/ , /g' \
          -e 's/(/ ( /g' \
          -e 's/)/ ) /g' \
          -e 's/\!/ \! /g' \
          -e 's/\?/ \? /g' \
          -e 's/\;/ /g' \
          -e 's/\:/ /g' \
    | tr -s " " \
    | myshuf
}
|
| 23 |
+
|
| 24 |
+
# Dataset names; ID and LR below are indexed in the same order.
DATASET=(
  ag_news
  sogou_news
  dbpedia
  yelp_review_polarity
  yelp_review_full
  yahoo_answers
  amazon_review_full
  amazon_review_polarity
)

# Google Drive file ids of the corresponding archives.
ID=(
  0Bz8a_Dbh9QhbUDNpeUdjb0wxRms # ag_news
  0Bz8a_Dbh9QhbUkVqNEszd0pHaFE # sogou_news
  0Bz8a_Dbh9QhbQ2Vic1kxMmZZQ1k # dbpedia
  0Bz8a_Dbh9QhbNUpYQ2N3SGlFaDg # yelp_review_polarity
  0Bz8a_Dbh9QhbZlU4dXhHTFhZQU0 # yelp_review_full
  0Bz8a_Dbh9Qhbd2JNdDBsQUdocVU # yahoo_answers
  0Bz8a_Dbh9QhbZVhsUnRWRDhETzA # amazon_review_full
  0Bz8a_Dbh9QhbaW12WVVZS2drcnM # amazon_review_polarity
)

# These learning rates were chosen by validation on a subset of the training set.
LR=( 0.25 0.5 0.5 0.1 0.1 0.1 0.05 0.05 )

RESULTDIR=result
DATADIR=data

mkdir -p "${RESULTDIR}"
mkdir -p "${DATADIR}"

# Small datasets first

for i in {0..0}
do
  echo "Downloading dataset ${DATASET[i]}"
  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
  then
    wget -c "https://drive.google.com/uc?export=download&id=${ID[i]}" -O "${DATADIR}/${DATASET[i]}_csv.tar.gz"
    tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
    cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train"
    cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test"
  fi
done

# Large datasets require a bit more work due to the extra request page

for i in {1..7}
do
  echo "Downloading dataset ${DATASET[i]}"
  if [ ! -f "${DATADIR}/${DATASET[i]}.train" ]
  then
    curl -c /tmp/cookies "https://drive.google.com/uc?export=download&id=${ID[i]}" > /tmp/intermezzo.html
    # The confirmation link is scraped from HTML, where "&" is written as
    # "&amp;"; decode it before using it as a URL.
    curl -L -b /tmp/cookies "https://drive.google.com$(cat /tmp/intermezzo.html | grep -Po 'uc-download-link" [^>]* href="\K[^"]*' | sed 's/\&amp;/\&/g')" > "${DATADIR}/${DATASET[i]}_csv.tar.gz"
    tar -xzvf "${DATADIR}/${DATASET[i]}_csv.tar.gz" -C "${DATADIR}"
    cat "${DATADIR}/${DATASET[i]}_csv/train.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.train"
    cat "${DATADIR}/${DATASET[i]}_csv/test.csv" | normalize_text > "${DATADIR}/${DATASET[i]}.test"
  fi
done

make

# Train and evaluate one model per dataset with its own learning rate.
for i in {0..7}
do
  echo "Working on dataset ${DATASET[i]}"
  ./fasttext supervised -input "${DATADIR}/${DATASET[i]}.train" \
    -output "${RESULTDIR}/${DATASET[i]}" -dim 10 -lr "${LR[i]}" -wordNgrams 2 \
    -minCount 1 -bucket 10000000 -epoch 5 -thread 4 > /dev/null
  ./fasttext test "${RESULTDIR}/${DATASET[i]}.bin" \
    "${DATADIR}/${DATASET[i]}.test"
done
|
hf_demo/fastText/crawl/README.md
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Preprocessing Common Crawl
|
| 2 |
+
|
| 3 |
+
This code downloads, preprocesses and splits per language the data from [Common Crawl](http://commoncrawl.org/).
|
| 4 |
+
|
| 5 |
+
This script uses the scripts and language identifier of [1].
|
| 6 |
+
|
| 7 |
+
This code inherits its requirements from [fastText](https://github.com/facebookresearch/fastText).
|
| 8 |
+
|
| 9 |
+
Set the variable WET_PATHS_URL to the crawl you want to process.
|
| 10 |
+
Please also set the variables NUM_LANGID and NUM_DEDUP in `download_crawl.sh` according to the capacity of your machine.
|
| 11 |
+
Langid processes are mostly limited by CPU usage, while dedup processes are likely to be limited by RAM usage (each uses 2GB of RAM).
|
| 12 |
+
|
| 13 |
+
### Reference
|
| 14 |
+
|
| 15 |
+
If you use this code, please cite:
|
| 16 |
+
|
| 17 |
+
[1] E. Grave*, P. Bojanowski*, P. Gupta, A. Joulin, T. Mikolov, [*Learning Word Vectors for 157 Languages*](https://arxiv.org/abs/1802.06893)
|
| 18 |
+
|
| 19 |
+
```
|
| 20 |
+
@inproceedings{grave2018learning,
|
| 21 |
+
title={Learning Word Vectors for 157 Languages},
|
| 22 |
+
author={Grave, Edouard and Bojanowski, Piotr and Gupta, Prakhar and Joulin, Armand and Mikolov, Tomas},
|
| 23 |
+
booktitle={Proceedings of the International Conference on Language Resources and Evaluation (LREC 2018)},
|
| 24 |
+
year={2018}
|
| 25 |
+
}
|
| 26 |
+
```
|