cnmoro committed on
Commit
fe73f95
·
verified ·
1 Parent(s): 489d90a

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +8 -1
README.md CHANGED
@@ -25,7 +25,7 @@ Concept:
25
  This will be trained for 2 epochs. The current model here is the first one.
26
 
27
  ```python
28
- import torch
29
  from transformers import AutoModel, AutoTokenizer
30
 
31
  model_name = "cnmoro/LexicalEmbed-Base"
@@ -34,7 +34,14 @@ tokenizer = AutoTokenizer.from_pretrained(model_name)
34
  model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
35
  model.eval()
36
 
 
 
 
 
 
 
37
  texts = ["hello world", "hel wor"]
 
38
  inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
39
 
40
  with torch.no_grad():
 
25
  This will be trained for 2 epochs. The current model here is the first one.
26
 
27
  ```python
28
+ import torch, re, unicodedata
29
  from transformers import AutoModel, AutoTokenizer
30
 
31
  model_name = "cnmoro/LexicalEmbed-Base"
 
34
  model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
35
  model.eval()
36
 
37
+ def preprocess(text):
38
+ text = unicodedata.normalize('NFD', text)
39
+ text = ''.join(c for c in text if unicodedata.category(c) != 'Mn')
40
+ text = re.sub(r'[^\w\s]+', ' ', text.lower())
41
+ return re.sub(r'\s+', ' ', text).strip()
42
+
43
  texts = ["hello world", "hel wor"]
44
+ texts = [ preprocess(s) for s in texts ]
45
  inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
46
 
47
  with torch.no_grad():