Update README.md
Browse files
README.md
CHANGED
|
@@ -51,27 +51,32 @@ In order to use SEC-BERT-NUM, you have to pre-process texts replacing every nume
|
|
| 51 |
Below is an example of how you can pre-process a simple sentence. This approach is quite simple; feel free to modify it as you see fit.
|
| 52 |
|
| 53 |
```python
|
|
|
|
| 54 |
import spacy
|
| 55 |
from transformers import AutoTokenizer
|
| 56 |
|
| 57 |
-
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/sec-bert-shape")
|
| 58 |
spacy_tokenizer = spacy.load("en_core_web_sm")
|
| 59 |
|
| 60 |
sentence = "Total net sales decreased 2% or $5.4 billion during 2019 compared to 2018."
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
if
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
else:
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
|
| 74 |
-
tokenized_sentence = tokenizer.tokenize(
|
| 75 |
print(tokenized_sentence)
|
| 76 |
"""
|
| 77 |
['total', 'net', 'sales', 'decreased', '[X]', '%', 'or', '$', '[X.X]', 'billion', 'during', '[XXXX]', 'compared', 'to', '[XXXX]', '.']
|
|
|
|
| 51 |
Below is an example of how you can pre-process a simple sentence. This approach is quite simple; feel free to modify it as you see fit.
|
| 52 |
|
| 53 |
```python
|
| 54 |
+
import re
|
| 55 |
import spacy
|
| 56 |
from transformers import AutoTokenizer
|
| 57 |
|
| 58 |
+
tokenizer = AutoTokenizer.from_pretrained("nlpaueb/sec-bert-shape")
|
| 59 |
spacy_tokenizer = spacy.load("en_core_web_sm")
|
| 60 |
|
| 61 |
sentence = "Total net sales decreased 2% or $5.4 billion during 2019 compared to 2018."
|
| 62 |
+
|
| 63 |
+
def sec_bert_shape_preprocess(text):
    """Replace every numeric token in ``text`` with its SEC-BERT-SHAPE pseudo-token.

    The input is tokenized with spaCy; each purely numeric token (e.g. "2",
    "5.4", "1,000", ".5") is mapped to a shape token such as '[X.X]' with
    every digit replaced by 'X'. Shapes not present in the tokenizer's
    additional special tokens fall back to the generic '[NUM]' token.

    Parameters
    ----------
    text : str
        Raw sentence to pre-process.

    Returns
    -------
    str
        Space-joined sequence of tokens with numbers replaced by shape tokens.
    """
    # Bug fix: tokenize the function argument, not the global `sentence` —
    # the original ignored `text` and always preprocessed the hard-coded example.
    tokens = [t.text for t in spacy_tokenizer(text)]

    processed_text = []
    for token in tokens:
        # Matches integers/decimals with optional grouping commas,
        # including forms that start with a separator like ".5" or ",5".
        if re.fullmatch(r"(\d+[\d,.]*)|([,.]\d+)", token):
            shape = '[' + re.sub(r'\d', 'X', token) + ']'
            if shape in tokenizer.additional_special_tokens:
                processed_text.append(shape)
            else:
                # Shape not in the model vocabulary: use the generic numeric token.
                processed_text.append('[NUM]')
        else:
            processed_text.append(token)

    return ' '.join(processed_text)
|
| 78 |
|
| 79 |
+
tokenized_sentence = tokenizer.tokenize(sec_bert_shape_preprocess(sentence))
|
| 80 |
print(tokenized_sentence)
|
| 81 |
"""
|
| 82 |
['total', 'net', 'sales', 'decreased', '[X]', '%', 'or', '$', '[X.X]', 'billion', 'during', '[XXXX]', 'compared', 'to', '[XXXX]', '.']
|