# Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved
"""Provides various text related util functions."""
import re
from typing import List, Tuple

import nltk
import spacy

nlp = spacy.load("en_core_web_sm")
nltk.download("stopwords")

# NOTE: kept after the download call, as in the original ordering; the corpus
# loader only reads the data when stopwords.words() is actually called.
from nltk.corpus import stopwords

# Spatial/relational words are removed from the stop-word list because they
# carry meaning in referring expressions ("the dog above the table").
STOP_WORDS = set(stopwords.words("english")) - {"above", "below", "between", "further", "he", "she", "they"}

# Penn Treebank noun tags: singular/plural, common/proper.
NOUN_TAGS = ["NN", "NNS", "NNP", "NNPS"]


def get_noun_phrase(root):
    """Collect the tokens of the noun phrase headed by ``root``.

    Starting from ``root``, repeatedly expands left children of noun tokens,
    accumulating every visited token (in discovery order).
    """
    queue = [root]
    all_toks = [root]
    while queue:
        curr = queue.pop()
        if curr.tag_ in NOUN_TAGS:
            # .lefts is a fresh generator on every access; materialize once.
            lefts = list(curr.lefts)
            queue += lefts
            all_toks += lefts
    return all_toks


def get_root_and_nouns(text: str, lazy=True) -> Tuple[str, str, List[Tuple[int, int]], List[Tuple[int, int]]]:
    """Given a sentence, returns a tuple with the following items:
    -- root text:str : the text associated with the root of the sentence
    -- negative_text:str: all the text that shouldn't be positively matched with a box other than the main one
    -- root_span: List[Tuple[int, int]] spans covering the root expressions, returned as a list of (beg, end)
       character spans
    -- negative_span: List[Tuple[int, int]] spans covering the negative expressions, returned as a list of
       (beg, end) character spans

    If lazy is False, then we try a bit harder to find the precise root of the sentence
    """
    sents = nlp(text)

    # Tokens that could plausibly refer to an object: nouns + personal pronouns.
    referring_toks = [x for x in sents if x.tag_ in NOUN_TAGS + ["PRP"]]
    if len(referring_toks) <= 1:
        if lazy or len(referring_toks) == 0:
            # NOTE(review): negative_text is " " here but "" in the other early
            # returns — preserved as-is since callers may rely on it.
            return text, " ", [(0, len(text))], [(0, len(text))]

    # Find the dependency root, skipping interjections ("hey", "oh", ...).
    root = None
    for token in sents:
        if token.dep_ == "ROOT":
            if token.tag_ == "UH":
                continue
            root = token
            break

    if root is None:
        return text, "", [(0, len(text))], [(0, len(text))]

    # Bail out when the root has a verbal compound child: the parse is likely
    # unreliable, so fall back to the whole text.
    verb_tags = ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
    if any(c.tag_ in verb_tags and c.dep_ == "compound" for c in root.children):
        return text, "", [(0, len(text))], [(0, len(text))]

    if root.tag_ in NOUN_TAGS:
        all_toks = get_noun_phrase(root)
    else:
        # Root is not a noun (e.g. a verb); use its first noun/pronoun child
        # together with that child's left modifiers instead.
        noun_children = [x for x in root.children if x.tag_ in NOUN_TAGS + ["PRP"]]
        if len(noun_children) < 1:
            return text, "", [(0, len(text))], [(0, len(text))]
        root = noun_children[0]
        all_toks = list(root.lefts) + [root]

    root_text = " ".join([x.text for x in all_toks])
    root_spans = [(x.idx, x.idx + len(x.text)) for x in all_toks]

    # Gather every other noun phrase; the negative tokens are everything that
    # is NOT part of those other noun phrases.
    everything_else = set()
    for token in sents:
        if token.tag_ in NOUN_TAGS and token.dep_ not in ["ROOT"] and token not in all_toks:
            everything_else = everything_else.union(set(get_noun_phrase(token)))

    negative_tokens = set(sents) - set(everything_else)
    negative_text = " ".join([x.text for x in negative_tokens])
    negative_spans = [(x.idx, x.idx + len(x.text)) for x in negative_tokens]
    return root_text, negative_text, root_spans, negative_spans


def normalize_sentence(sentence):
    """Return the sentence lower-cased, with punctuation removed, whitespace
    normalized, stop words dropped, and remaining words lemmatized, joined by
    single spaces. Lemmas starting with '-' (spaCy pronoun placeholders) keep
    the surface word instead.
    """
    sent = normalize_whitespace(remove_punctuation(sentence.lower()))
    tokens = nlp(sent)
    # NOTE(review): this assumes spaCy's tokenization aligns 1:1 with a plain
    # whitespace split of ``sent`` — confirm, otherwise lemmas can mismatch.
    return " ".join(
        [
            tokens[i].lemma_ if tokens[i].lemma_[0] != "-" else w
            for i, w in enumerate(sent.split(" "))
            if w not in STOP_WORDS
        ]
    )


def remove_punctuation(text):
    """Remove the punctuation characters !?,;.:- from ``text``."""
    corrected = str(text)
    corrected = re.sub(r"([!?,;.:-])", r"", corrected)
    return corrected


def simplify_punctuation(text):
    """Collapse doubled-or-more punctuation to a single character. The
    exception is runs of dots, which become exactly '...'. Also removes
    single spaces around hyphens.
    """
    corrected = str(text)
    corrected = re.sub(r"([!?,;:-])\1+", r"\1", corrected)  # "!!!" -> "!"
    corrected = re.sub(r"\.{2,}", r"...", corrected)  # ".." / "....." -> "..."
    corrected = re.sub(r"\s?-\s?", r"-", corrected)  # "a - b" -> "a-b"
    return corrected


def normalize_whitespace(text):
    """Normalize whitespace: newlines, carriage returns, tabs, underscores and
    the literal sequence '//t' all become spaces; runs of whitespace collapse
    to a single space; leading/trailing spaces are stripped.
    """
    corrected = str(text)
    corrected = re.sub(r"//t", r"\t", corrected)  # literal "//t" -> tab (spaced out below)
    corrected = re.sub(r"\n", r" ", corrected)
    corrected = re.sub(r"_", r" ", corrected)
    corrected = re.sub(r"\r", r" ", corrected)
    corrected = re.sub(r"\t", r" ", corrected)
    corrected = re.sub(r"\s+", r" ", corrected)
    return corrected.strip(" ")