check if word is plural
Browse files
utils.py
CHANGED
|
@@ -1,4 +1,7 @@
|
|
| 1 |
import torch
|
|
|
|
|
|
|
|
|
|
| 2 |
from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
|
| 3 |
from torch import nn
|
| 4 |
from itertools import chain
|
|
@@ -9,6 +12,9 @@ import re
|
|
| 9 |
import string
|
| 10 |
import inflect
|
| 11 |
|
|
|
|
|
|
|
|
|
|
| 12 |
punct_chars = list((set(string.punctuation) | {'β', 'β', 'β', 'β', '~', '|', 'β', 'β', 'β¦', "'", "`", '_'}))
|
| 13 |
punct_chars.sort()
|
| 14 |
punctuation = ''.join(punct_chars)
|
|
@@ -825,6 +831,15 @@ PLURAL_TO_SINGULAR_EXCLUSIONS = [
|
|
| 825 |
|
| 826 |
p = inflect.engine()
|
| 827 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 828 |
def singular_to_plural(word):
|
| 829 |
"""Convert singular words to plural using inflect."""
|
| 830 |
plural = p.plural(word)
|
|
@@ -832,9 +847,9 @@ def singular_to_plural(word):
|
|
| 832 |
|
| 833 |
def plural_to_singular(word):
|
| 834 |
"""Convert plural word to singular using inflect."""
|
| 835 |
-
if word
|
| 836 |
-
return word
|
| 837 |
-
return
|
| 838 |
|
| 839 |
plural_MATH_WORDS = [singular_to_plural(word) for word in MATH_WORDS]
|
| 840 |
|
|
|
|
| 1 |
import torch
|
| 2 |
+
import nltk
|
| 3 |
+
from nltk import pos_tag
|
| 4 |
+
from nltk.tokenize import word_tokenize
|
| 5 |
from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
|
| 6 |
from torch import nn
|
| 7 |
from itertools import chain
|
|
|
|
| 12 |
import string
|
| 13 |
import inflect
|
| 14 |
|
| 15 |
+
nltk.download("averaged_perceptron_tagger")
|
| 16 |
+
nltk.download("punkt")
|
| 17 |
+
|
| 18 |
punct_chars = list((set(string.punctuation) | {'β', 'β', 'β', 'β', '~', '|', 'β', 'β', 'β¦', "'", "`", '_'}))
|
| 19 |
punct_chars.sort()
|
| 20 |
punctuation = ''.join(punct_chars)
|
|
|
|
| 831 |
|
| 832 |
p = inflect.engine()
|
| 833 |
|
| 834 |
+
def is_plural(word):
    """Determine if a word is plural using NLTK's part-of-speech tagging.

    The input is tokenized with ``word_tokenize`` and only the tag of the
    first resulting token is inspected.

    Args:
        word: The text to classify.

    Returns:
        True if the first token is tagged as a plural noun (``NNS`` or
        ``NNPS`` in Penn Treebank tags); False otherwise, including for
        empty or whitespace-only input.
    """
    # Empty or whitespace-only input produces no tokens; indexing the empty
    # tag list below would raise IndexError, so answer "not plural" up front.
    if not word or not word.strip():
        return False

    # Tokenize the input word (necessary for NLTK tagging)
    tokens = word_tokenize(word)
    if not tokens:
        return False

    # Get the part-of-speech tag for the first token
    pos = pos_tag(tokens)[0][1]

    # Check if the word is tagged as plural (NNS or NNPS in Penn Treebank tags)
    return pos in ("NNS", "NNPS")
|
| 842 |
+
|
| 843 |
def singular_to_plural(word):
|
| 844 |
"""Convert singular words to plural using inflect."""
|
| 845 |
plural = p.plural(word)
|
|
|
|
| 847 |
|
| 848 |
def plural_to_singular(word):
    """Return the singular form of ``word`` when it is detected as plural.

    Words that ``is_plural`` does not classify as plural are returned
    unchanged. If inflect's ``singular_noun`` yields a falsy result, the
    original word is returned as well.
    """
    # Guard clause: nothing to do for words not detected as plural.
    if not is_plural(word):
        return word

    singular = p.singular_noun(word)
    # Fall back to the input when inflect gives back a falsy value.
    return singular if singular else word
|
| 853 |
|
| 854 |
plural_MATH_WORDS = [singular_to_plural(word) for word in MATH_WORDS]
|
| 855 |
|