Update README.md
Browse files
README.md
CHANGED
|
@@ -37,33 +37,31 @@ The model is intended to be used on segments of **250** characters. Anything els
|
|
| 37 |
To use Emendator, you can load it via the Transformers library:
|
| 38 |
|
| 39 |
```python
|
| 40 |
-
|
| 41 |
from transformers import T5ForConditionalGeneration, AutoTokenizer
|
| 42 |
|
| 43 |
-
model_path =
|
| 44 |
-
|
| 45 |
|
|
|
|
| 46 |
model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.bfloat16).to(device)
|
| 47 |
-
|
| 48 |
model.eval()
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=True):
|
| 55 |
outputs = model.generate(
|
| 56 |
enc["input_ids"],
|
| 57 |
attention_mask=enc["attention_mask"],
|
| 58 |
-
max_new_tokens=
|
| 59 |
num_beams=4,
|
| 60 |
do_sample=False,
|
| 61 |
early_stopping=True,
|
| 62 |
repetition_penalty=1.15,
|
| 63 |
-
use_cache=True,
|
| 64 |
)
|
| 65 |
-
|
| 66 |
-
|
| 67 |
```
|
| 68 |
|
| 69 |
If you use this in your work, please cite:
|
|
@@ -72,7 +70,7 @@ If you use this in your work, please cite:
|
|
| 72 |
author = {McCarthy, A. M.},
|
| 73 |
title = {{Emendator}: Latin OCR Artifact Correction},
|
| 74 |
year = {2026},
|
| 75 |
-
howpublished = {\url{https://huggingface.co/
|
| 76 |
note = {Model}
|
| 77 |
}
|
| 78 |
```
|
|
|
|
| 37 |
To use Emendator, you can load it via the Transformers library:
|
| 38 |
|
| 39 |
```python
|
| 40 |
+
import torch
|
| 41 |
from transformers import T5ForConditionalGeneration, AutoTokenizer
|
| 42 |
|
| 43 |
+
model_path = "aimgo/Emendator"
|
| 44 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 45 |
|
| 46 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
| 47 |
model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32).to(device)
|
|
|
|
| 48 |
model.eval()
|
| 49 |
|
| 50 |
+
texts = ["Nil igirur rnors cft ad nos ncq;pcrtinct hilurn»", "Vt quod ali cibus eft aliis fuat acre uenenurn."]
|
| 51 |
+
enc = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
|
| 52 |
+
|
| 53 |
+
with torch.no_grad():
|
|
|
|
| 54 |
outputs = model.generate(
|
| 55 |
enc["input_ids"],
|
| 56 |
attention_mask=enc["attention_mask"],
|
| 57 |
+
max_new_tokens=enc["input_ids"].shape[1] + 32,
|
| 58 |
num_beams=4,
|
| 59 |
do_sample=False,
|
| 60 |
early_stopping=True,
|
| 61 |
repetition_penalty=1.15,
|
|
|
|
| 62 |
)
|
| 63 |
+
|
| 64 |
+
corrected = tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
| 65 |
```
|
| 66 |
|
| 67 |
If you use this in your work, please cite:
|
|
|
|
| 70 |
author = {McCarthy, A. M.},
|
| 71 |
title = {{Emendator}: Latin OCR Artifact Correction},
|
| 72 |
year = {2026},
|
| 73 |
+
howpublished = {\url{https://huggingface.co/aimgo/Emendator}},
|
| 74 |
note = {Model}
|
| 75 |
}
|
| 76 |
```
|