# Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved
"""Provides various text related util function"""
import re
from typing import List, Tuple
import nltk
import spacy
# Load spaCy's small English pipeline once at import time (module-level side effect).
nlp = spacy.load("en_core_web_sm")
# Make sure the NLTK stopword corpus is present (downloads on first use).
nltk.download("stopwords")
from nltk.corpus import stopwords
# English stopwords, minus spatial/relational words and pronouns that carry
# meaning for referring expressions and so must not be stripped.
STOP_WORDS = set(stopwords.words("english")) - set(["above", "below", "between", "further", "he", "she", "they"])
def get_noun_phrase(root):
    """Collect *root* plus the left-hand dependents of every noun reached from it.

    Starting at *root*, walks depth-first through left children, expanding only
    tokens tagged as nouns (NN/NNS/NNP/NNPS). Returns the visited tokens as a
    list, with *root* first, in discovery order.
    """
    pending = [root]
    collected = [root]
    while pending:
        token = pending.pop()
        if token.tag_ not in ("NN", "NNS", "NNP", "NNPS"):
            continue
        lefts = list(token.lefts)
        pending.extend(lefts)
        collected.extend(lefts)
    return collected
def get_root_and_nouns(text: str, lazy=True) -> Tuple[str, str, List[Tuple[int, int]], List[Tuple[int, int]]]:
    """Given a sentence, returns a tuple with the following items:
    -- root_text: str: the text associated with the root of the sentence
    -- negative_text: str: all the text that shouldn't be positively matched with a box other than the main one
    -- root_spans: List[Tuple[int, int]]: spans covering the root expressions, returned as (beg, end) character spans
    -- negative_spans: List[Tuple[int, int]]: spans covering the negative expressions, returned as (beg, end) character spans
    If lazy is False, then we try a bit harder to find the precise root of the sentence.
    """
    noun_tags = ("NN", "NNS", "NNP", "NNPS")
    nounish_tags = noun_tags + ("PRP",)
    sents = nlp(text)
    whole_span = [(0, len(text))]

    # Fix: the noun/pronoun count was previously computed twice (two identical
    # list comprehensions); compute it once. Also removed a dead
    # `negative_text = []` assignment that was never used.
    num_nounish = sum(1 for tok in sents if tok.tag_ in nounish_tags)
    if num_nounish <= 1:
        # With at most one noun-like token, the whole sentence is the root
        # expression. (The " " negative text is kept as-is for backward
        # compatibility with the original behavior on this path.)
        if lazy or num_nounish == 0:
            return text, " ", whole_span, whole_span

    # Find the syntactic root, skipping interjections (tag UH).
    root = None
    for token in sents:
        if token.dep_ == "ROOT" and token.tag_ != "UH":
            root = token
            break
    if root is None:
        return text, "", whole_span, whole_span

    # If the root has a verbal compound child, the parse is unreliable; bail out.
    verb_tags = ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ")
    if any(c.tag_ in verb_tags and c.dep_ == "compound" for c in root.children):
        return text, "", whole_span, whole_span

    if root.tag_ in noun_tags:
        all_toks = get_noun_phrase(root)
    else:
        # The root is not a noun: fall back to its first noun/pronoun child.
        candidates = [c for c in root.children if c.tag_ in nounish_tags]
        if len(candidates) < 1:
            return text, "", whole_span, whole_span
        root = candidates[0]
        all_toks = list(root.lefts) + [root]
    root_text = " ".join(tok.text for tok in all_toks)
    root_spans = [(tok.idx, tok.idx + len(tok.text)) for tok in all_toks]

    # Tokens belonging to *other* noun phrases must not be treated as negative
    # text: collect them and subtract from the full token set.
    everything_else = set()
    for token in sents:
        if token.tag_ in noun_tags and token.dep_ != "ROOT" and token not in all_toks:
            everything_else |= set(get_noun_phrase(token))
    negative_tokens = set(sents) - everything_else
    negative_text = " ".join(tok.text for tok in negative_tokens)
    negative_spans = [(tok.idx, tok.idx + len(tok.text)) for tok in negative_tokens]
    return root_text, negative_text, root_spans, negative_spans
def normalize_sentence(sentence):
    """Return *sentence* lower-cased, stripped of punctuation, stopwords and
    extra whitespace, with each remaining word replaced by its lemma.

    Words whose spaCy lemma starts with '-' (e.g. the '-PRON-' placeholder)
    keep their surface form instead of the lemma.
    """
    # Fix: the sentence was previously lower-cased twice, with the first
    # result discarded; do the cleaning in one pipeline.
    sent = normalize_whitespace(remove_punctuation(sentence.lower()))
    tokens = nlp(sent)
    # NOTE(review): this assumes spaCy's tokenization aligns 1:1 with a
    # whitespace split of the cleaned sentence; spaCy may split differently
    # (e.g. contractions), in which case the indices could drift — confirm.
    return " ".join(
        [
            tokens[i].lemma_ if tokens[i].lemma_[0] != "-" else w
            for i, w in enumerate(sent.split(" "))
            if w not in STOP_WORDS
        ]
    )
def remove_punctuation(text):
    """Strip every occurrence of the punctuation characters ! ? , ; . : -.

    The input is coerced to ``str`` first, so non-string values are accepted.
    """
    return str(text).translate(str.maketrans("", "", "!?,;.:-"))
def simplify_punctuation(text):
    """Collapse repeated punctuation into a single mark.

    Runs of '!', '?', ',', ';', ':' or '-' become one character; runs of two
    or more dots become the ellipsis '...'; whitespace around hyphens is
    removed.
    """
    result = str(text)
    for pattern, replacement in (
        (r"([!?,;:-])\1+", r"\1"),
        (r"\.{2,}", r"..."),
        (r"\s?-\s?", r"-"),
    ):
        result = re.sub(pattern, replacement, result)
    return result
def normalize_whitespace(text):
    """Normalize whitespace: tabs, newlines, carriage returns, underscores and
    runs of whitespace all become a single standard space; the result is
    trimmed of leading/trailing spaces.
    """
    # NOTE(review): the first pattern matches the literal text "//t" (not an
    # escaped backslash-t) — presumably targeting mangled tab markers; confirm.
    result = str(text)
    substitutions = (
        (r"//t", r"\t"),
        (r"\n", r" "),
        (r"_", r" "),
        (r"\r", r" "),
        (r"\t", r" "),
        (r"\s+", r" "),
    )
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip(" ")