Update pipeline.py
Browse files - pipeline.py +10 -0
pipeline.py
CHANGED
|
@@ -654,15 +654,25 @@ class NormalisationPipeline(Pipeline):
|
|
| 654 |
for i in range(len(result)):
|
| 655 |
input_sent, pred_sent = input_sents[i].strip(), result[i][0]['text'].strip()
|
| 656 |
input_sent = input_sent.replace('ſ' , 's')
|
|
|
|
|
|
|
| 657 |
if not self.no_post_clean:
|
| 658 |
pred_sent = self.post_cleaning(pred_sent)
|
| 659 |
alignment, pred_sent_tok = self.align(input_sent, pred_sent)
|
| 660 |
|
|
|
|
| 661 |
if not self.no_postproc_lex:
|
| 662 |
alignment = self.postprocess_correct_sent(alignment)
|
|
|
|
|
|
|
| 663 |
pred_sent = self.get_pred_from_alignment(alignment)
|
|
|
|
|
|
|
| 664 |
if not self.no_post_clean:
|
| 665 |
pred_sent = self.post_cleaning(pred_sent)
|
|
|
|
|
|
|
|
|
|
| 666 |
char_spans = self.get_char_idx_align(input_sent, pred_sent, alignment)
|
| 667 |
output.append({'text': pred_sent, 'alignment': char_spans})
|
| 668 |
return output
|
|
|
|
| 654 |
for i in range(len(result)):
|
| 655 |
input_sent, pred_sent = input_sents[i].strip(), result[i][0]['text'].strip()
|
| 656 |
input_sent = input_sent.replace('ſ' , 's')
|
| 657 |
+
|
| 658 |
+
# apply cleaning and get alignment (necessary for postprocessing w/ the lexicon)
|
| 659 |
if not self.no_post_clean:
|
| 660 |
pred_sent = self.post_cleaning(pred_sent)
|
| 661 |
alignment, pred_sent_tok = self.align(input_sent, pred_sent)
|
| 662 |
|
| 663 |
+
# apply postprocessing w/ the lexicon to the sentence (using the alignment)
|
| 664 |
if not self.no_postproc_lex:
|
| 665 |
alignment = self.postprocess_correct_sent(alignment)
|
| 666 |
+
|
| 667 |
+
# get the predicted sentence from the alignment
|
| 668 |
pred_sent = self.get_pred_from_alignment(alignment)
|
| 669 |
+
|
| 670 |
+
# run a second round of cleaning and recompute the alignment in case the predicted sentence has changed
|
| 671 |
if not self.no_post_clean:
|
| 672 |
pred_sent = self.post_cleaning(pred_sent)
|
| 673 |
+
alignment, pred_sent_tok = self.align(input_sent, pred_sent)
|
| 674 |
+
|
| 675 |
+
# get aligned character spans
|
| 676 |
char_spans = self.get_char_idx_align(input_sent, pred_sent, alignment)
|
| 677 |
output.append({'text': pred_sent, 'alignment': char_spans})
|
| 678 |
return output
|