Spaces:
No application file
No application file
Upload errant_gec.py with huggingface_hub
Browse files- errant_gec.py +15 -4
errant_gec.py
CHANGED
|
@@ -138,12 +138,23 @@ class Errant(evaluate.Metric):
|
|
| 138 |
self._annotators[lang] = annotator
|
| 139 |
return annotator
|
| 140 |
|
| 141 |
-
def _get_edits(self, annotator, orig_doc, cor_doc):
|
| 142 |
"""Extract edits between original and corrected documents.
|
| 143 |
|
| 144 |
Returns a set of (o_start, o_end, o_str, c_str) tuples.
|
|
|
|
|
|
|
|
|
|
| 145 |
"""
|
| 146 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
edit_set = set()
|
| 148 |
for edit in edits:
|
| 149 |
# Skip noop edits (no actual change)
|
|
@@ -210,8 +221,8 @@ class Errant(evaluate.Metric):
|
|
| 210 |
ref_doc = annotator.parse(reference)
|
| 211 |
|
| 212 |
# Get edit sets
|
| 213 |
-
hyp_edits = self._get_edits(annotator, orig_doc, hyp_doc)
|
| 214 |
-
ref_edits = self._get_edits(annotator, orig_doc, ref_doc)
|
| 215 |
|
| 216 |
# Compute TP, FP, FN for this sample
|
| 217 |
tp = len(ref_edits & hyp_edits)
|
|
|
|
| 138 |
self._annotators[lang] = annotator
|
| 139 |
return annotator
|
| 140 |
|
| 141 |
+
def _get_edits(self, annotator, orig_doc, cor_doc, lang: str = "en"):
|
| 142 |
"""Extract edits between original and corrected documents.
|
| 143 |
|
| 144 |
Returns a set of (o_start, o_end, o_str, c_str) tuples.
|
| 145 |
+
|
| 146 |
+
For non-English languages, we skip classification since ERRANT's
|
| 147 |
+
classifier uses English-specific POS tag mappings.
|
| 148 |
"""
|
| 149 |
+
# Align with standard Levenshtein costs (lev=True) and merge, for every language
|
| 150 |
+
# This matches the behavior of errant_parallel with the -lev flag
|
| 151 |
+
alignment = annotator.align(orig_doc, cor_doc, lev=True)
|
| 152 |
+
edits = annotator.merge(alignment)
|
| 153 |
+
|
| 154 |
+
# Only classify for English (classifier uses English POS tags)
|
| 155 |
+
if lang == "en":
|
| 156 |
+
edits = [annotator.classify(edit) for edit in edits]
|
| 157 |
+
|
| 158 |
edit_set = set()
|
| 159 |
for edit in edits:
|
| 160 |
# Skip noop edits (no actual change)
|
|
|
|
| 221 |
ref_doc = annotator.parse(reference)
|
| 222 |
|
| 223 |
# Get edit sets
|
| 224 |
+
hyp_edits = self._get_edits(annotator, orig_doc, hyp_doc, lang)
|
| 225 |
+
ref_edits = self._get_edits(annotator, orig_doc, ref_doc, lang)
|
| 226 |
|
| 227 |
# Compute TP, FP, FN for this sample
|
| 228 |
tp = len(ref_edits & hyp_edits)
|