File size: 3,651 Bytes
908351f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import regex
import re

def retrieve_text_cite(text, command):
    r"""Return the mandatory-argument text of every ``\<command>{...}`` in *text*.

    A match consists of a backslash, ``command``, any number of optional
    ``[...]`` arguments, one brace-delimited argument (nested, balanced braces
    are allowed inside it) and any number of trailing ``[...]`` arguments.

    Args:
        text: The LaTeX source to scan.
        command: The command name without the leading backslash, e.g.
            ``'cite'`` or ``'section'``. It is interpolated into the pattern
            verbatim (not escaped), so regex metacharacters in it are
            interpreted as regex syntax.

    Returns:
        A list with the content between the outer braces of each match, in
        order of appearance. The list is empty when nothing matches.
    """
    # ``(?1)`` recurses into capture group 1, which is what lets the pattern
    # match balanced nested braces; this syntax requires the third-party
    # ``regex`` module (the stdlib ``re`` has no recursion).
    base_pattern = (
        r'\\' + command + r"(?:\[(?:.*?)\])*\{((?:[^{}]+|\{(?1)\})*)\}(?:\[(?:.*?)\])*"
    )

    # Group 1 is exactly the mandatory argument. Reading it directly (instead
    # of re-searching the matched substring for its first ``{...}``) keeps the
    # result correct when an optional argument itself contains braces, e.g.
    # ``\cite[{\em see}]{key}``.
    return [match.group(1) for match in regex.finditer(base_pattern, text)]

def get_citing_sentences(content):
    r"""Map each sentence of *content* that contains ``\cite`` to its citation keys.

    The text is normalised (newlines and repeated spaces collapsed, the
    periods of ``e.g.``, ``i.e.`` and ``etc.`` removed) and then split on
    ``'.'``; every piece gets a ``'.'`` appended again. Splitting is purely
    textual, so any other period (decimals, other abbreviations, periods
    inside a citation key) also ends a "sentence".

    Args:
        content: LaTeX source text.

    Returns:
        A dict whose keys are the normalised sentences containing the
        substring ``\cite`` and whose values are lists of citation keys found
        by ``retrieve_text_cite(sentence, 'cite')``, split on commas, with
        surrounding whitespace removed and empty entries dropped. A sentence
        that only uses a variant such as ``\citep`` is still a key, but its
        list is empty because only the plain ``cite`` command is extracted.
        If the same sentence occurs twice, the later occurrence wins.
    """
    # Replace every run of newlines with one space so that sentences wrapped
    # across several lines are kept together.
    content_new = re.sub(r'[\n]+', ' ', content)
    # Remove the periods of common abbreviations; otherwise the split on '.'
    # below would cut the sentence in the middle.
    content_new = re.sub(r'e\.g\.', 'eg', content_new)
    content_new = re.sub(r'i\.e\.', 'ie', content_new)
    content_new = re.sub(r'etc\.', 'etc', content_new)
    content_new = re.sub(r' +', ' ', content_new)

    results = {}
    for fragment in content_new.split('.'):
        if '\\cite' not in fragment:
            continue
        sentence = fragment + '.'
        final_citations = []
        for cite in retrieve_text_cite(sentence, 'cite'):
            # ``\cite{a, b}`` lists several keys; strip the padding so the
            # keys compare equal to the bibliography entries.
            stripped_keys = (key.strip() for key in cite.split(','))
            final_citations.extend(key for key in stripped_keys if key)
        results[sentence] = final_citations
    return results

def get_intro(content):
    r"""Return the body of the ``\section{Introduction}`` section of *content*.

    Section titles are collected with ``retrieve_text_cite(content,
    'section')``; the first one equal to ``introduction`` (ignoring case and
    surrounding whitespace) is selected.

    Args:
        content: LaTeX source text.

    Returns:
        The text between the end of the introduction heading and the start of
        the next ``\section`` heading. An empty string is returned when there
        is no introduction section, when it is the last section (there is no
        following heading to delimit it), or when either heading cannot be
        located in *content*.
    """
    sections = retrieve_text_cite(content, 'section')
    intro_index = next(
        (
            index
            for index, title in enumerate(sections)
            if title.strip().lower() == 'introduction'
        ),
        None,
    )
    if intro_index is None or intro_index + 1 >= len(sections):
        return ''

    def heading_pattern(title):
        # Allow optional ``[...]`` arguments between the command and the
        # title, as the section extraction does, so that a heading such as
        # ``\section[short]{Introduction}`` is still located.
        return re.compile(
            r'\\section(?:\[(?:.*?)\])*\{' + re.escape(title) + r'\}'
        )

    start = heading_pattern(sections[intro_index]).search(content)
    if start is None:
        return ''
    # Look for the next heading only after the introduction heading, so an
    # identical title earlier in the document cannot be picked up.
    end = heading_pattern(sections[intro_index + 1]).search(content, start.end())
    if end is None:
        return ''
    return content[start.end():end.start()]

def get_related_works(content):
    r"""Return the body of the related-work (literature review) section of *content*.

    Section titles are collected with ``retrieve_text_cite(content,
    'section')``; the first one that equals one of the known related-work
    titles (ignoring case and surrounding whitespace) is selected.

    Args:
        content: LaTeX source text.

    Returns:
        The text between the end of the selected heading and the start of the
        next ``\section`` heading. An empty string is returned when no
        section title matches, when the matching section is the last one
        (there is no following heading to delimit it), or when either heading
        cannot be located in *content*.
    """
    possible_related = [
        "Literature Review",
        "Related Work",
        "Related Works",
        "Prior Work",
        "Prior Works",
        "Related Research",
        "Research Overview",
        "Previous Work",
        "Previous Works",
        "Review of the Literature",
        "Review of Related Literature",
        "Survey of Related Work",
        "Survey of Related Works",
        "Background",
        "Research Background",
        "Review of Prior Research",
        "Literature Survey",
        "Overview of Literature",
        "Existing Literature",
        "Review of Existing Work",
        "Review of Existing Works",
        "Review of Previous Studies",
        "Review of Prior Literature",
        "Summary of Related Research",
        "Survey of Existing Literature",
        "Survey of Literature",
        "Existing Research Overview",
        "Prior Literature Review",
    ]
    # Lower-case once so each section title needs a single set lookup.
    known_titles = {title.lower() for title in possible_related}

    sections = retrieve_text_cite(content, 'section')
    related_index = next(
        (
            index
            for index, title in enumerate(sections)
            if title.strip().lower() in known_titles
        ),
        None,
    )
    if related_index is None or related_index + 1 >= len(sections):
        return ''

    def heading_pattern(title):
        # Allow optional ``[...]`` arguments between the command and the
        # title, as the section extraction does, so that a heading such as
        # ``\section[short]{Related Work}`` is still located.
        return re.compile(
            r'\\section(?:\[(?:.*?)\])*\{' + re.escape(title) + r'\}'
        )

    start = heading_pattern(sections[related_index]).search(content)
    if start is None:
        return ''
    # Look for the next heading only after the selected heading, so an
    # identical title earlier in the document cannot be picked up.
    end = heading_pattern(sections[related_index + 1]).search(content, start.end())
    if end is None:
        return ''
    return content[start.end():end.start()]