aimgo committed on
Commit
3cfc949
·
verified ·
1 Parent(s): 7552ad5

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +12 -14
README.md CHANGED
@@ -37,33 +37,31 @@ The model is intended to be used on segments of **250** characters. Anything els
37
  To use Emendator, you can load it via the Transformers library:
38
 
39
  ```python
40
-
41
  from transformers import T5ForConditionalGeneration, AutoTokenizer
42
 
43
- model_path = 'aimgo/Emendator'
44
- tokenizer = AutoTokenizer.from_pretrained(model_path)
45
 
 
46
  model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.bfloat16).to(device)
47
-
48
  model.eval()
49
 
50
- enc = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=256).to(self.device)
51
-
52
- max_input_len = enc["input_ids"].shape[1]
53
-
54
- with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=True):
55
  outputs = model.generate(
56
  enc["input_ids"],
57
  attention_mask=enc["attention_mask"],
58
- max_new_tokens=max_input_len + 32,
59
  num_beams=4,
60
  do_sample=False,
61
  early_stopping=True,
62
  repetition_penalty=1.15,
63
- use_cache=True,
64
  )
65
-
66
- outputs = tokenizer.batch_decode(outputs, skip_special_tokens=True)
67
  ```
68
 
69
  If you use this in your work, please cite:
@@ -72,7 +70,7 @@ If you use this in your work, please cite:
72
  author = {McCarthy, A. M.},
73
  title = {{Emendator}: Latin OCR Artifact Correction},
74
  year = {2026},
75
- howpublished = {\url{https://huggingface.co/datasets/aimgo/Emendator}},
76
  note = {Model}
77
  }
78
  ```
 
37
  To use Emendator, you can load it via the Transformers library:
38
 
39
  ```python
40
+ import torch
41
  from transformers import T5ForConditionalGeneration, AutoTokenizer
42
 
43
+ model_path = "aimgo/Emendator"
44
+ device = "cuda" if torch.cuda.is_available() else "cpu"
45
 
46
+ tokenizer = AutoTokenizer.from_pretrained(model_path)
47
  model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.bfloat16).to(device)
 
48
  model.eval()
49
 
50
+ texts = ["Nil igirur rnors cft ad nos ncq;pcrtinct hilurn»", "Vt quod ali cibus eft aliis fuat acre uenenurn."]
51
+ enc = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
52
+
53
+ with torch.no_grad():
 
54
  outputs = model.generate(
55
  enc["input_ids"],
56
  attention_mask=enc["attention_mask"],
57
+ max_new_tokens=enc["input_ids"].shape[1] + 32,
58
  num_beams=4,
59
  do_sample=False,
60
  early_stopping=True,
61
  repetition_penalty=1.15,
 
62
  )
63
+
64
+ corrected = tokenizer.batch_decode(outputs, skip_special_tokens=True)
65
  ```
66
 
67
  If you use this in your work, please cite:
 
70
  author = {McCarthy, A. M.},
71
  title = {{Emendator}: Latin OCR Artifact Correction},
72
  year = {2026},
73
+ howpublished = {\url{https://huggingface.co/aimgo/Emendator}},
74
  note = {Model}
75
  }
76
  ```