stanford-nlpxed
/

transcript-analysis

ikarasz committed on Mar 22, 2025

Commit

f0b228b

1 Parent(s): aa6c0b3

add wordnet check for irregular plurals

Files changed (1) hide show

utils.py CHANGED Viewed

@@ -2,6 +2,7 @@ import torch
 import nltk
 from nltk import pos_tag
 from nltk.tokenize import word_tokenize
 from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
 from torch import nn
 from itertools import chain
@@ -16,7 +17,7 @@ nltk.download('punkt')
 nltk.download('punkt_tab')
 nltk.download('averaged_perceptron_tagger')
 nltk.download('averaged_perceptron_tagger_eng')
-# nltk.download('wordnet')
 # nltk.download('omw-1.4')
 punct_chars = list((set(string.punctuation) | {'’', '‘', '–', '—', '~', '|', '“', '”', '…', "'", "`", '_'}))
@@ -835,6 +836,12 @@ PLURAL_TO_SINGULAR_EXCLUSIONS = [
 p = inflect.engine()
 def is_plural(word):
     """Determine if a word is plural using NLTK's part-of-speech tagging."""
     # Tokenize the input word (necessary for NLTK tagging)
@@ -853,6 +860,8 @@ def plural_to_singular(word):
     """Convert plural word to singular using inflect."""
     if is_plural(word):
         return p.singular_noun(word) or word
     return word
 plural_MATH_WORDS = [singular_to_plural(word) for word in MATH_WORDS]

 import nltk
 from nltk import pos_tag
 from nltk.tokenize import word_tokenize
+from nltk.corpus import wordnet
 from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
 from torch import nn
 from itertools import chain
 nltk.download('punkt_tab')
 nltk.download('averaged_perceptron_tagger')
 nltk.download('averaged_perceptron_tagger_eng')
+nltk.download('wordnet')
 # nltk.download('omw-1.4')
 punct_chars = list((set(string.punctuation) | {'’', '‘', '–', '—', '~', '|', '“', '”', '…', "'", "`", '_'}))
 p = inflect.engine()
+def is_plural_wordnet(word):
+    # Check if WordNet has both singular and plural forms
+    singular_synsets = wordnet.synsets(word, pos=wordnet.NOUN)
+    plural_synsets = wordnet.synsets(word.rstrip('s'), pos=wordnet.NOUN)
+    return len(plural_synsets) > len(singular_synsets)
 def is_plural(word):
     """Determine if a word is plural using NLTK's part-of-speech tagging."""
     # Tokenize the input word (necessary for NLTK tagging)
     """Convert plural word to singular using inflect."""
     if is_plural(word):
         return p.singular_noun(word) or word
+    if is_plural_wordnet(word):
+        return p.singular_noun(word) or word
     return word
 plural_MATH_WORDS = [singular_to_plural(word) for word in MATH_WORDS]