nlpaueb commited on
Commit
e87cd29
·
1 Parent(s): f56c452

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +18 -13
README.md CHANGED
@@ -51,27 +51,32 @@ In order to use SEC-BERT-NUM, you have to pre-process texts replacing every nume
51
  Below is an example of how you can pre-process a simple sentence. This approach is quite simple; feel free to modify it as you see fit.
52
 
53
  ```python
 
54
  import spacy
55
  from transformers import AutoTokenizer
56
 
57
- tokenizer = AutoTokenizer.from_pretrained("nlpaueb/sec-bert-num")
58
  spacy_tokenizer = spacy.load("en_core_web_sm")
59
 
60
  sentence = "Total net sales decreased 2% or $5.4 billion during 2019 compared to 2018."
61
- tokens = [t.text for t in spacy_tokenizer(sentence)]
62
-
63
- processed_sentence = []
64
- for token in tokens:
65
- if re.fullmatch(r"(\d+[\d,.]*)|([,.]\d+)", token):
66
- shape = '[' + re.sub(r'\d', 'X', token) + ']'
67
- if shape in tokenizer.additional_special_tokens:
68
- processed_sentence.append(shape)
 
 
 
 
69
  else:
70
- processed_sentence.append('[NUM]')
71
- else:
72
- processed_sentence.append(token)
73
 
74
- tokenized_sentence = tokenizer.tokenize(' '.join(processed_sentence))
75
  print(tokenized_sentence)
76
  """
77
  ['total', 'net', 'sales', 'decreased', '[X]', '%', 'or', '$', '[X.X]', 'billion', 'during', '[XXXX]', 'compared', 'to', '[XXXX]', '.']
 
51
  Below is an example of how you can pre-process a simple sentence. This approach is quite simple; feel free to modify it as you see fit.
52
 
53
  ```python
54
+ import re
55
  import spacy
56
  from transformers import AutoTokenizer
57
 
58
+ tokenizer = AutoTokenizer.from_pretrained("nlpaueb/sec-bert-shape")
59
  spacy_tokenizer = spacy.load("en_core_web_sm")
60
 
61
  sentence = "Total net sales decreased 2% or $5.4 billion during 2019 compared to 2018."
62
+
63
+ def sec_bert_shape_preprocess(text):
64
+ tokens = [t.text for t in spacy_tokenizer(text)]
65
+
66
+ processed_text = []
67
+ for token in tokens:
68
+ if re.fullmatch(r"(\d+[\d,.]*)|([,.]\d+)", token):
69
+ shape = '[' + re.sub(r'\d', 'X', token) + ']'
70
+ if shape in tokenizer.additional_special_tokens:
71
+ processed_text.append(shape)
72
+ else:
73
+ processed_text.append('[NUM]')
74
  else:
75
+ processed_text.append(token)
76
+
77
+ return ' '.join(processed_text)
78
 
79
+ tokenized_sentence = tokenizer.tokenize(sec_bert_shape_preprocess(sentence))
80
  print(tokenized_sentence)
81
  """
82
  ['total', 'net', 'sales', 'decreased', '[X]', '%', 'or', '$', '[X.X]', 'billion', 'during', '[XXXX]', 'compared', 'to', '[XXXX]', '.']