File size: 4,855 Bytes
032e687
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
# Copyright (c) Aishwarya Kamath & Nicolas Carion. Licensed under the Apache License 2.0. All Rights Reserved
"""Provides various text related util function"""
import re
from typing import List, Tuple

import nltk
import spacy

# Shared spacy pipeline used below for POS tagging, dependency parsing and
# lemmatization. Loaded once at import time, so importing this module requires
# the "en_core_web_sm" model to be installed.
nlp = spacy.load("en_core_web_sm")

# Import-time download so that `stopwords` below is guaranteed to be available;
# the corpus import is deliberately placed after the download (hence not at the
# top of the file with the other imports).
nltk.download("stopwords")
from nltk.corpus import stopwords

# English stopwords, minus a handful of words kept on purpose — presumably
# because they stay meaningful for visual grounding ("above"/"below"/"between"
# locate objects, "he"/"she"/"they" refer to people) — TODO confirm intent.
STOP_WORDS = set(stopwords.words("english")) - set(["above", "below", "between", "further", "he", "she", "they"])


def get_noun_phrase(root):
    """Collect the noun phrase rooted at *root*.

    Starting from *root*, repeatedly takes the left-hand children of every
    token whose POS tag marks it as a noun (NN/NNS/NNP/NNPS) and adds them to
    the phrase. Tokens that are not nouns contribute nothing further.
    Returns the collected tokens, beginning with *root* itself.
    """
    collected = [root]
    pending = [root]
    while pending:
        token = pending.pop()
        if token.tag_ in ("NN", "NNS", "NNP", "NNPS"):
            # Materialize once: spacy's Token.lefts yields a fresh generator
            # per access, and we need the same sequence in both lists.
            left_children = list(token.lefts)
            pending.extend(left_children)
            collected.extend(left_children)
    return collected


def get_root_and_nouns(text: str, lazy: bool = True) -> Tuple[str, str, List[Tuple[int, int]], List[Tuple[int, int]]]:
    """Given a sentence, returns a tuple with the following items:
    -- root text:str  : the text associated with the root of the sentence
    -- negative_text:str: all the text that shouldn't be positively matched with a box other than the main one
    -- root_span: List[Tuple[int, int]] spans covering the root expressions, returned as a list of (beg, end) character spans
    -- negative_span: List[Tuple[int, int]] spans covering the negative expressions, returned as a list of (beg, end) character spans

    If lazy is False, then we try a bit harder to find the precise root of the sentence
    """
    sents = nlp(text)
    # NOTE(review): this initial value is never read — negative_text is rebuilt
    # from negative_tokens below before any use.
    negative_text = []

    # At most one noun/pronoun in the whole sentence: fall back to treating the
    # entire text as the root expression.
    if len([x for x in sents if x.tag_ in ["NN", "NNS", "NNP", "NNPS", "PRP"]]) <= 1:
        if lazy or len([x for x in sents if x.tag_ in ["NN", "NNS", "NNP", "NNPS", "PRP"]]) == 0:
            # NOTE(review): negative_text is " " here but "" in every other
            # early return — confirm whether the difference is intentional.
            return text, " ", [(0, len(text))], [(0, len(text))]

    # Locate the dependency root of the parse, skipping interjections (UH).
    root = None
    for token in sents:
        if token.dep_ == "ROOT":
            if token.tag_ == "UH":
                continue
            root = token
            break

    if root is None:
        return text, "", [(0, len(text))], [(0, len(text))]

    # A verb attached to the root with a "compound" relation: presumably a parse
    # we can't trust, so fall back to the whole text — TODO confirm rationale.
    if (
        len([c for c in root.children if c.tag_ in ["VB", "VBD", "VBG", "VBN", "VBP", "VBZ"] and c.dep_ == "compound"])
        > 0
    ):
        return text, "", [(0, len(text))], [(0, len(text))]

    all_toks = []
    if root.tag_ in ["NN", "NNS", "NNP", "NNPS"]:
        # The root is itself a noun: take its noun phrase (root + left modifiers).
        all_toks = get_noun_phrase(root)
        root_text = " ".join([x.text for x in all_toks])
        root_spans = [(x.idx, x.idx + len(x.text)) for x in all_toks]
    else:
        # Root is not a noun (e.g. a verb): use its first noun/pronoun child
        # together with that child's left-hand modifiers.
        root = [x for x in root.children if x.tag_ in ["NN", "NNS", "NNP", "NNPS", "PRP"]]
        if len(root) < 1:
            return text, "", [(0, len(text))], [(0, len(text))]
        else:
            root = root[0]
        all_toks = list(root.lefts) + [root]
        root_text = " ".join([x.text for x in all_toks])
        root_spans = [(x.idx, x.idx + len(x.text)) for x in all_toks]

    # Collect the noun phrases of every *other* noun in the sentence (the
    # candidate "other objects").
    everything_else = set()
    for token in sents:
        if token.tag_ in ["NN", "NNS", "NNP", "NNPS"] and token.dep_ not in ["ROOT"] and token not in all_toks:
            everything_else = everything_else.union(set(get_noun_phrase(token)))

    # Negative tokens = every token NOT belonging to another object's noun
    # phrase; the root tokens are therefore included (they may only match the
    # main box). NOTE(review): set iteration order is arbitrary, so
    # negative_text / negative_spans are not guaranteed to follow sentence
    # order — confirm downstream consumers don't rely on ordering.
    negative_tokens = set(sents) - set(everything_else)
    negative_text = " ".join([x.text for x in negative_tokens])
    negative_spans = [(x.idx, x.idx + len(x.text)) for x in negative_tokens]

    return root_text, negative_text, root_spans, negative_spans


def normalize_sentence(sentence):
    """Return *sentence* lowercased, cleaned of punctuation/extra whitespace,
    lemmatized, and stripped of stopwords (STOP_WORDS).

    The result is a single space-joined string. Fix vs. original: the first
    assignment (`sent = sentence.lower()`) was dead code, immediately
    overwritten by the next line; it has been removed.
    """
    sent = remove_punctuation(sentence.lower())
    sent = normalize_whitespace(sent)
    tokens = nlp(sent)
    # NOTE(review): tokens are indexed by the position of the whitespace-split
    # word, which assumes spacy tokenizes the cleaned sentence exactly on
    # spaces — confirm, otherwise lemmas can be misaligned or index errors raised.
    return " ".join(
        [
            # Lemmas starting with "-" (e.g. spacy's "-PRON-" placeholder) are
            # not real words; keep the surface form instead.
            tokens[i].lemma_ if tokens[i].lemma_[0] != "-" else w
            for i, w in enumerate(sent.split(" "))
            if w not in STOP_WORDS
        ]
    )


def remove_punctuation(text):
    """Delete every occurrence of the punctuation marks !?,;.:- from *text*.

    Non-string input is first converted with str(). All other characters are
    left untouched.
    """
    deletion_table = str.maketrans("", "", "!?,;.:-")
    return str(text).translate(deletion_table)


def simplify_punctuation(text):
    """Collapse repeated or spaced-out punctuation.

    Runs of a repeated '!', '?', ',', ';', ':' or '-' become a single mark,
    any run of two or more dots becomes exactly '...', and optional spaces
    around a hyphen are removed. Rules are applied in that order.
    """
    rules = (
        (r"([!?,;:-])\1+", r"\1"),  # collapse runs of one repeated mark
        (r"\.{2,}", r"..."),        # any multi-dot run -> ellipsis
        (r"\s?-\s?", r"-"),         # tighten spacing around hyphens
    )
    result = str(text)
    for pattern, replacement in rules:
        result = re.sub(pattern, replacement, result)
    return result


def normalize_whitespace(text):
    """Normalize whitespace in *text* to single standard spaces.

    The literal sequence "//t" and underscores are also converted to spaces;
    newlines, carriage returns and tabs become spaces; runs of whitespace are
    collapsed to one space; leading/trailing spaces are trimmed.
    """
    # "//t" is first rewritten to a real tab (matching the original pipeline),
    # which the loop below then turns into a space like any other tab.
    result = str(text).replace("//t", "\t")
    for mark in ("\n", "_", "\r", "\t"):
        result = result.replace(mark, " ")
    return re.sub(r"\s+", " ", result).strip(" ")