Update README.md
Browse files
README.md
CHANGED
|
@@ -37,33 +37,31 @@ The model is intended to be used on segments of **250** characters. Anything els
|
|
| 37 |
To use Emendator, you can load it via the Transformers library:
|
| 38 |
|
| 39 |
```python
|
| 40 |
-
|
| 41 |
from transformers import T5ForConditionalGeneration, AutoTokenizer
|
| 42 |
|
| 43 |
-
model_path =
|
| 44 |
-
|
| 45 |
|
|
|
|
| 46 |
model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.bfloat16).to(device)
|
| 47 |
-
|
| 48 |
model.eval()
|
| 49 |
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
with torch.cuda.amp.autocast(dtype=torch.bfloat16, enabled=True):
|
| 55 |
outputs = model.generate(
|
| 56 |
enc["input_ids"],
|
| 57 |
attention_mask=enc["attention_mask"],
|
| 58 |
-
max_new_tokens=
|
| 59 |
num_beams=4,
|
| 60 |
do_sample=False,
|
| 61 |
early_stopping=True,
|
| 62 |
repetition_penalty=1.15,
|
| 63 |
-
use_cache=True,
|
| 64 |
)
|
| 65 |
-
|
| 66 |
-
|
| 67 |
```
|
| 68 |
|
| 69 |
If you use this in your work, please cite:
|
|
@@ -72,7 +70,7 @@ If you use this in your work, please cite:
|
|
| 72 |
author = {McCarthy, A. M.},
|
| 73 |
title = {{Emendator}: Latin OCR Artifact Correction},
|
| 74 |
year = {2026},
|
| 75 |
-
howpublished = {\url{https://huggingface.co/
|
| 76 |
note = {Model}
|
| 77 |
}
|
| 78 |
```
|
|
|
|
| 37 |
To use Emendator, you can load it via the Transformers library:
|
| 38 |
|
| 39 |
```python
|
| 40 |
+
import torch
|
| 41 |
from transformers import T5ForConditionalGeneration, AutoTokenizer
|
| 42 |
|
| 43 |
+
model_path = "aimgo/Emendator"
|
| 44 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 45 |
|
| 46 |
+
tokenizer = AutoTokenizer.from_pretrained(model_path)
|
| 47 |
model = T5ForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32).to(device)
|
|
|
|
| 48 |
model.eval()
|
| 49 |
|
| 50 |
+
texts = ["Nil igirur rnors cft ad nos ncq;pcrtinct hilurn»", "Vt quod ali cibus eft aliis fuat acre uenenurn."]
|
| 51 |
+
enc = tokenizer(texts, return_tensors="pt", padding=True, truncation=True, max_length=256).to(device)
|
| 52 |
+
|
| 53 |
+
with torch.no_grad():
|
|
|
|
| 54 |
outputs = model.generate(
|
| 55 |
enc["input_ids"],
|
| 56 |
attention_mask=enc["attention_mask"],
|
| 57 |
+
max_new_tokens=enc["input_ids"].shape[1] + 32,
|
| 58 |
num_beams=4,
|
| 59 |
do_sample=False,
|
| 60 |
early_stopping=True,
|
| 61 |
repetition_penalty=1.15,
|
|
|
|
| 62 |
)
|
| 63 |
+
|
| 64 |
+
corrected = tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
| 65 |
```
|
| 66 |
|
| 67 |
If you use this in your work, please cite:
|
|
|
|
| 70 |
author = {McCarthy, A. M.},
|
| 71 |
title = {{Emendator}: Latin OCR Artifact Correction},
|
| 72 |
year = {2026},
|
| 73 |
+
howpublished = {\url{https://huggingface.co/aimgo/Emendator}},
|
| 74 |
note = {Model}
|
| 75 |
}
|
| 76 |
```
|