marksverdhei committed on
Commit
6e6f86e
·
verified ·
1 Parent(s): 44444ec

Upload errant_gec.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. errant_gec.py +15 -4
errant_gec.py CHANGED
@@ -138,12 +138,23 @@ class Errant(evaluate.Metric):
138
  self._annotators[lang] = annotator
139
  return annotator
140
 
141
- def _get_edits(self, annotator, orig_doc, cor_doc):
142
  """Extract edits between original and corrected documents.
143
 
144
  Returns a set of (o_start, o_end, o_str, c_str) tuples.
 
 
 
145
  """
146
- edits = annotator.annotate(orig_doc, cor_doc)
 
 
 
 
 
 
 
 
147
  edit_set = set()
148
  for edit in edits:
149
  # Skip noop edits (no actual change)
@@ -210,8 +221,8 @@ class Errant(evaluate.Metric):
210
  ref_doc = annotator.parse(reference)
211
 
212
  # Get edit sets
213
- hyp_edits = self._get_edits(annotator, orig_doc, hyp_doc)
214
- ref_edits = self._get_edits(annotator, orig_doc, ref_doc)
215
 
216
  # Compute TP, FP, FN for this sample
217
  tp = len(ref_edits & hyp_edits)
 
138
  self._annotators[lang] = annotator
139
  return annotator
140
 
141
+ def _get_edits(self, annotator, orig_doc, cor_doc, lang: str = "en"):
142
  """Extract edits between original and corrected documents.
143
 
144
  Returns a set of (o_start, o_end, o_str, c_str) tuples.
145
+
146
+ For non-English languages, we skip classification since ERRANT's
147
+ classifier uses English-specific POS tag mappings.
148
  """
149
+ # Use align and merge without classification for non-English
150
+ # This matches the behavior of errant_parallel with -lev flag
151
+ alignment = annotator.align(orig_doc, cor_doc, lev=True)
152
+ edits = annotator.merge(alignment)
153
+
154
+ # Only classify for English (classifier uses English POS tags)
155
+ if lang == "en":
156
+ edits = [annotator.classify(edit) for edit in edits]
157
+
158
  edit_set = set()
159
  for edit in edits:
160
  # Skip noop edits (no actual change)
 
221
  ref_doc = annotator.parse(reference)
222
 
223
  # Get edit sets
224
+ hyp_edits = self._get_edits(annotator, orig_doc, hyp_doc, lang)
225
+ ref_edits = self._get_edits(annotator, orig_doc, ref_doc, lang)
226
 
227
  # Compute TP, FP, FN for this sample
228
  tp = len(ref_edits & hyp_edits)