check-plurality
#5
by
ikarasz
- opened
utils.py
CHANGED
|
@@ -1,4 +1,8 @@
|
|
| 1 |
import torch
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
|
| 3 |
from torch import nn
|
| 4 |
from itertools import chain
|
|
@@ -9,6 +13,13 @@ import re
|
|
| 9 |
import string
|
| 10 |
import inflect
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
punct_chars = list((set(string.punctuation) | {'β', 'β', 'β', 'β', '~', '|', 'β', 'β', 'β¦', "'", "`", '_'}))
|
| 13 |
punct_chars.sort()
|
| 14 |
punctuation = ''.join(punct_chars)
|
|
@@ -825,6 +836,32 @@ PLURAL_TO_SINGULAR_EXCLUSIONS = [
|
|
| 825 |
|
| 826 |
p = inflect.engine()
|
| 827 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 828 |
def singular_to_plural(word):
|
| 829 |
"""Convert singular words to plural using inflect."""
|
| 830 |
plural = p.plural(word)
|
|
@@ -832,9 +869,9 @@ def singular_to_plural(word):
|
|
| 832 |
|
| 833 |
def plural_to_singular(word):
|
| 834 |
"""Convert plural word to singular using inflect."""
|
| 835 |
-
if word
|
| 836 |
-
return word
|
| 837 |
-
return
|
| 838 |
|
| 839 |
plural_MATH_WORDS = [singular_to_plural(word) for word in MATH_WORDS]
|
| 840 |
|
|
|
|
| 1 |
import torch
|
| 2 |
+
import nltk
|
| 3 |
+
from nltk import pos_tag
|
| 4 |
+
from nltk.tokenize import word_tokenize
|
| 5 |
+
from nltk.corpus import wordnet
|
| 6 |
from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
|
| 7 |
from torch import nn
|
| 8 |
from itertools import chain
|
|
|
|
| 13 |
import string
|
| 14 |
import inflect
|
| 15 |
|
| 16 |
+
# Fetch the NLTK data packages this module relies on (tokenizer, POS tagger
# and WordNet), in the same order as before.
for _nltk_resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    # 'omw-1.4' is intentionally left disabled.
):
    nltk.download(_nltk_resource)
del _nltk_resource  # do not leave the loop variable behind as a module attribute
|
| 22 |
+
|
| 23 |
punct_chars = list((set(string.punctuation) | {'β', 'β', 'β', 'β', '~', '|', 'β', 'β', 'β¦', "'", "`", '_'}))
|
| 24 |
# Put the characters in sorted order, then concatenate them into one string.
punct_chars = sorted(punct_chars)
punctuation = ''.join(punct_chars)
|
|
|
|
| 836 |
|
| 837 |
# Shared inflect engine; the conversion helpers below call p.plural() and
# p.singular_noun() on it.
p = inflect.engine()
|
| 838 |
|
| 839 |
+
def is_plural_regex(word):
    """Guess whether a word is plural from its spelling alone.

    A word is treated as plural when, ignoring case, it ends in ``s`` but
    not in ``ss`` (``"cats"`` and ``"boxes"`` match, ``"class"`` does not).
    This is a spelling heuristic only: irregular plurals such as
    ``"children"`` are missed and singular words such as ``"bus"`` match.

    Args:
        word: The word to test.

    Returns:
        bool: ``True`` if the spelling looks plural, ``False`` otherwise.
    """
    lowered = word.lower()
    # The former alternation ``s$|es$|ies$`` is equivalent to plain ``s$``
    # because every alternative ends in "s".  Comparing against None makes
    # the function return a real bool; previously a word that does not end
    # in "s" produced ``None``.
    ends_in_s = re.search(r's$', lowered) is not None
    ends_in_ss = re.search(r'ss$', lowered) is not None
    return ends_in_s and not ends_in_ss
|
| 843 |
+
|
| 844 |
+
def is_plural_wordnet(word):
    """Guess whether a word is plural by comparing WordNet noun entries.

    Looks up the noun synsets of ``word`` as given and of ``word`` with one
    trailing ``s`` removed, and reports plural when the shortened form has
    strictly more noun synsets than the original spelling.

    Args:
        word: The word to test.

    Returns:
        bool: ``True`` if the shortened form has more noun synsets than
        ``word`` itself, ``False`` otherwise (including when ``word`` does
        not end in ``s``, since both lookups are then the same).
    """
    # Remove exactly one trailing "s".  ``str.rstrip('s')`` strips *every*
    # trailing "s" ("ass" -> "a", "class" -> "cla"), which would compare the
    # word against an unrelated, shorter word.
    candidate_singular = word[:-1] if word.endswith('s') else word
    given_synsets = wordnet.synsets(word, pos=wordnet.NOUN)
    candidate_synsets = wordnet.synsets(candidate_singular, pos=wordnet.NOUN)
    return len(candidate_synsets) > len(given_synsets)
|
| 849 |
+
|
| 850 |
+
def is_plural_pos(word):
    """Determine if a word is plural using NLTK's part-of-speech tagging.

    The input is tokenized and only the first token's tag is inspected.

    Args:
        word: The word to test.

    Returns:
        bool: ``True`` if the first token is tagged ``NNS`` or ``NNPS``
        (the Penn Treebank plural noun tags); ``False`` otherwise, including
        when the input produces no tokens.
    """
    # Tokenize first: pos_tag works on a list of tokens, not a raw string.
    tokens = word_tokenize(word)
    # Empty or whitespace-only input yields no tokens; indexing [0] below
    # would raise IndexError, so treat it as "not plural".
    if not tokens:
        return False
    first_tag = pos_tag(tokens)[0][1]
    return first_tag in ("NNS", "NNPS")
|
| 858 |
+
|
| 859 |
+
def is_plural(word):
    """Report whether ``word`` is judged to be plural.

    Words listed in ``PLURAL_TO_SINGULAR_EXCLUSIONS`` are never plural.
    Otherwise the spelling, part-of-speech and WordNet checks are tried in
    that order; the first truthy result is returned, and later checks are
    not run once one succeeds.
    """
    if word in PLURAL_TO_SINGULAR_EXCLUSIONS:
        return False
    verdict = False
    for check in (is_plural_regex, is_plural_pos, is_plural_wordnet):
        verdict = check(word)
        if verdict:
            break
    # Either the first truthy result or the last check's result.
    return verdict
|
| 864 |
+
|
| 865 |
def singular_to_plural(word):
|
| 866 |
"""Convert singular words to plural using inflect."""
|
| 867 |
plural = p.plural(word)
|
|
|
|
| 869 |
|
| 870 |
def plural_to_singular(word):
    """Return the singular form of ``word`` when it is detected as plural.

    Words that ``is_plural`` rejects are returned unchanged.  When inflect
    gives a falsy result for a word that was detected as plural, the word
    itself is returned.
    """
    if not is_plural(word):
        return word
    singular = p.singular_noun(word)
    return singular if singular else word
|
| 875 |
|
| 876 |
# Plural form of every entry in MATH_WORDS, in the same order.
plural_MATH_WORDS = list(map(singular_to_plural, MATH_WORDS))
|
| 877 |
|