stanford-nlpxed
/

transcript-analysis

Model card Files Files and versions

check-plurality

#5

by ikarasz - opened Mar 22, 2025

base: refs/heads/main

←

from: refs/pr/5

Discussion Files changed

Files changed (1) hide show

utils.py +40 -3

utils.py CHANGED Viewed

@@ -1,4 +1,8 @@
 import torch
 from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
 from torch import nn
 from itertools import chain
@@ -9,6 +13,13 @@ import re
 import string
 import inflect
 punct_chars = list((set(string.punctuation) | {'’', '‘', '–', '—', '~', '|', '“', '”', '…', "'", "`", '_'}))
 punct_chars.sort()
 punctuation = ''.join(punct_chars)
@@ -825,6 +836,32 @@ PLURAL_TO_SINGULAR_EXCLUSIONS = [
 p = inflect.engine()
 def singular_to_plural(word):
     """Convert singular words to plural using inflect."""
     plural = p.plural(word)
@@ -832,9 +869,9 @@ def singular_to_plural(word):
 def plural_to_singular(word):
     """Convert plural word to singular using inflect."""
-    if word in PLURAL_TO_SINGULAR_EXCLUSIONS:
-        return word
-    return p.singular_noun(word) or word
 plural_MATH_WORDS = [singular_to_plural(word) for word in MATH_WORDS]

 import torch
+import nltk
+from nltk import pos_tag
+from nltk.tokenize import word_tokenize
+from nltk.corpus import wordnet
 from transformers.models.bert.modeling_bert import BertModel, BertPreTrainedModel
 from torch import nn
 from itertools import chain
 import string
 import inflect
+# These calls sit at module top level, so they run every time this module is
+# imported.
+# NOTE(review): presumably the punkt* resources back word_tokenize, the
+# averaged_perceptron_tagger* resources back pos_tag, and 'wordnet' backs the
+# WordNet corpus reader used below -- confirm against the NLTK documentation.
+nltk.download('punkt')
+nltk.download('punkt_tab')
+nltk.download('averaged_perceptron_tagger')
+nltk.download('averaged_perceptron_tagger_eng')
+nltk.download('wordnet')
+# nltk.download('omw-1.4')
 punct_chars = list((set(string.punctuation) | {'’', '‘', '–', '—', '~', '|', '“', '”', '…', "'", "`", '_'}))
 punct_chars.sort()
 punctuation = ''.join(punct_chars)
 p = inflect.engine()
+def is_plural_regex(word):
+    """Heuristically detect a plural from the word's spelling alone.
+
+    A word counts as plural when its lowercased form ends in an "s" that is
+    not part of a final "ss". The check is purely orthographic, so singular
+    words that end in a single "s" (e.g. "bus") are reported as plural too.
+
+    Args:
+        word: The word to inspect.
+
+    Returns:
+        bool: True if the spelling looks plural, False otherwise.
+    """
+    lowered = word.lower()
+    # r's$' already covers the "-es" and "-ies" endings; bool() ensures a real
+    # boolean is returned rather than a Match object or None.
+    return bool(re.search(r's$', lowered)) and not re.search(r'ss$', lowered)
+def is_plural_wordnet(word):
+    """Guess plurality by comparing WordNet noun synset counts.
+
+    The word is looked up as given and again with one trailing "s" removed;
+    it is treated as plural when the shortened form has strictly more noun
+    synsets than the original spelling.
+
+    Args:
+        word: The word to inspect.
+
+    Returns:
+        bool: True if the shortened form has more noun synsets than the
+        word itself, False otherwise (including words not ending in "s").
+    """
+    if not word.endswith('s'):
+        # Nothing to strip: both lookups would be identical, so the strict
+        # comparison below could never succeed.
+        return False
+    # Drop exactly one trailing "s". rstrip('s') would remove every trailing
+    # "s" (e.g. "boss" -> "bo") and look up an unrelated stem.
+    stem = word[:-1]
+    word_synsets = wordnet.synsets(word, pos=wordnet.NOUN)
+    stem_synsets = wordnet.synsets(stem, pos=wordnet.NOUN)
+    return len(stem_synsets) > len(word_synsets)
+def is_plural_pos(word):
+    """Determine if a word is plural using NLTK's part-of-speech tagging.
+
+    Args:
+        word: The word to inspect. If it tokenizes into several tokens,
+            only the first token's tag is considered.
+
+    Returns:
+        bool: True if the first token is tagged NNS or NNPS (the Penn
+        Treebank plural-noun tags), False otherwise or when tokenization
+        produces no tokens.
+    """
+    # pos_tag works on a token list, so the word is tokenized first.
+    tokens = word_tokenize(word)
+    if not tokens:
+        # No tokens (e.g. an empty string): indexing the tagger output
+        # below would raise IndexError, so report "not plural" instead.
+        return False
+    tag = pos_tag(tokens)[0][1]
+    return tag in ("NNS", "NNPS")
+def is_plural(word):
+    """Check if a word is plural.
+
+    Words listed in PLURAL_TO_SINGULAR_EXCLUSIONS are never reported as
+    plural. Any other word is plural if at least one of the spelling, POS-tag
+    or WordNet checks says so.
+
+    Args:
+        word: The word to inspect.
+
+    Returns:
+        bool: True if the word is considered plural, False otherwise.
+    """
+    if word in PLURAL_TO_SINGULAR_EXCLUSIONS:
+        return False
+    # `or` short-circuits, so later checks run only when earlier ones fail.
+    # bool() keeps helper-specific values (such as a regex Match object or
+    # None) from leaking out to callers.
+    return bool(
+        is_plural_regex(word) or is_plural_pos(word) or is_plural_wordnet(word)
+    )
 def singular_to_plural(word):
     """Convert singular words to plural using inflect."""
     plural = p.plural(word)
 def plural_to_singular(word):
     """Convert plural word to singular using inflect."""
+    # Only words that is_plural() accepts are passed to inflect; everything
+    # else (including the exclusion list handled inside is_plural) is
+    # returned unchanged.
+    if is_plural(word):
+        # `or word` falls back to the input when singular_noun() returns a
+        # falsy value.
+        return p.singular_noun(word) or word
+    return word
 plural_MATH_WORDS = [singular_to_plural(word) for word in MATH_WORDS]