add wordnet check for irregular plurals
Browse files
utils.py
CHANGED
|
@@ -2,6 +2,7 @@ import torch
|
|
| 2 |
import nltk
|
| 3 |
from nltk import pos_tag
|
| 4 |
from nltk.tokenize import word_tokenize
|
|
|
|
| 5 |
from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
|
| 6 |
from torch import nn
|
| 7 |
from itertools import chain
|
|
@@ -16,7 +17,7 @@ nltk.download('punkt')
|
|
| 16 |
nltk.download('punkt_tab')
|
| 17 |
nltk.download('averaged_perceptron_tagger')
|
| 18 |
nltk.download('averaged_perceptron_tagger_eng')
|
| 19 |
-
|
| 20 |
# nltk.download('omw-1.4')
|
| 21 |
|
| 22 |
punct_chars = list((set(string.punctuation) | {'β', 'β', 'β', 'β', '~', '|', 'β', 'β', 'β¦', "'", "`", '_'}))
|
|
@@ -835,6 +836,12 @@ PLURAL_TO_SINGULAR_EXCLUSIONS = [
|
|
| 835 |
|
| 836 |
p = inflect.engine()
|
| 837 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 838 |
def is_plural(word):
|
| 839 |
"""Determine if a word is plural using NLTK's part-of-speech tagging."""
|
| 840 |
# Tokenize the input word (necessary for NLTK tagging)
|
|
@@ -853,6 +860,8 @@ def plural_to_singular(word):
|
|
| 853 |
"""Convert plural word to singular using inflect."""
|
| 854 |
if is_plural(word):
|
| 855 |
return p.singular_noun(word) or word
|
|
|
|
|
|
|
| 856 |
return word
|
| 857 |
|
| 858 |
plural_MATH_WORDS = [singular_to_plural(word) for word in MATH_WORDS]
|
|
|
|
| 2 |
import nltk
|
| 3 |
from nltk import pos_tag
|
| 4 |
from nltk.tokenize import word_tokenize
|
| 5 |
+
from nltk.corpus import wordnet
|
| 6 |
from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
|
| 7 |
from torch import nn
|
| 8 |
from itertools import chain
|
|
|
|
| 17 |
nltk.download('punkt_tab')
|
| 18 |
nltk.download('averaged_perceptron_tagger')
|
| 19 |
nltk.download('averaged_perceptron_tagger_eng')
|
| 20 |
+
nltk.download('wordnet')
|
| 21 |
# nltk.download('omw-1.4')
|
| 22 |
|
| 23 |
punct_chars = list((set(string.punctuation) | {'β', 'β', 'β', 'β', '~', '|', 'β', 'β', 'β¦', "'", "`", '_'}))
|
|
|
|
| 836 |
|
| 837 |
p = inflect.engine()
|
| 838 |
|
| 839 |
+
def is_plural_wordnet(word):
    """Heuristically decide whether ``word`` is a plural noun using WordNet.

    The word is looked up as a noun twice: once exactly as given and once
    with a single trailing "s" removed.  It is reported as plural when the
    shortened form has strictly more noun synsets than the word itself.

    Args:
        word: The token to test.

    Returns:
        True if the form without its final "s" has more WordNet noun
        synsets than ``word``; False otherwise, including for every word
        that does not end in "s".
    """
    # Without a trailing "s" both lookups would be for the same string, so
    # the strict comparison below could never succeed; skip the lookups.
    if not word.endswith('s'):
        return False

    # Slice off exactly one "s" instead of using rstrip('s'): rstrip removes
    # every trailing "s" (e.g. "moss" -> "mo"), which would compare the word
    # against an unrelated shorter string.
    base_form = word[:-1]

    word_synsets = wordnet.synsets(word, pos=wordnet.NOUN)
    base_synsets = wordnet.synsets(base_form, pos=wordnet.NOUN)

    # NOTE(review): plurals that do not end in "s" (e.g. "children") always
    # yield False here.  If those are meant to be detected, a lemma lookup
    # such as wordnet.morphy may be a better fit -- confirm intended scope.
    return len(base_synsets) > len(word_synsets)
|
| 844 |
+
|
| 845 |
def is_plural(word):
|
| 846 |
"""Determine if a word is plural using NLTK's part-of-speech tagging."""
|
| 847 |
# Tokenize the input word (necessary for NLTK tagging)
|
|
|
|
| 860 |
"""Convert plural word to singular using inflect."""
|
| 861 |
if is_plural(word):
|
| 862 |
return p.singular_noun(word) or word
|
| 863 |
+
if is_plural_wordnet(word):
|
| 864 |
+
return p.singular_noun(word) or word
|
| 865 |
return word
|
| 866 |
|
| 867 |
plural_MATH_WORDS = [singular_to_plural(word) for word in MATH_WORDS]
|