File size: 7,824 Bytes
8228dae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
#*********************************************************************


# This module could serve as a foundation stone of the project.
# For now it contains only utility functions used throughout the
# other files, but in the future it may hold more complex structures.


#*********************************************************************
import pdfplumber
import docx2txt
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer, models,util
import nltk
from nltk.tokenize import sent_tokenize, wordpunct_tokenize
nltk.download("punkt")



def reading_word(string):
    """
    Extract the plain text of a Word (.docx) document.

    Input:

    string: path of the .docx file to read.

    Returns: string with the document's full text.
    """
    # BUG FIX: the original ignored `string` and always read "var.docx".
    return docx2txt.process(string)

def reading_pdf(string):
    """
    Extract the text of a PDF, keeping only "char" objects with a font
    size below 10 pt (i.e. filtering out large text such as headings).

    Input:

    string: path of the PDF file to read.

    Returns: long string with each page's text preceded by a newline
    (matching the original concatenation scheme).
    """
    pages = []
    with pdfplumber.open(string) as pdf:
        for pdf_page in pdf.pages:
            # Keep everything that is NOT a char of size >= 10.
            small = pdf_page.filter(
                lambda obj: not (obj["object_type"] == "char" and obj["size"] >= 10)
            )
            # BUG FIX: extract_text may return None for an empty/filtered
            # page; the original would crash on str + None.
            single_page_text = small.extract_text(x_tolerance=2) or ""
            pages.append(single_page_text)
    # join() avoids the quadratic str += of the original loop.
    return "".join("\n" + page for page in pages)


def reading_file(string):
    """
    -----------------------------------------------------------------------------

    This function takes as argument the file that we want to analyze and
    dispatches on its extension to the right reader. For the moment we
    detect only: PDF and Word (.docx).

    Returns: Long string with all the sentences in the document.

    Raises: ValueError for any other extension (the original printed a
    message and then crashed with UnboundLocalError).

    -----------------------------------------------------------------------------

    Input:

    string: path of the file we want to analyze

    """
    ext = os.path.splitext(string)[-1].lower()
    if ext == ".pdf":
        return reading_pdf(string)
    if ext == ".docx":
        return reading_word(string)
    # BUG FIX: `text` was unbound here, so the original raised
    # UnboundLocalError right after printing. Fail explicitly instead.
    raise ValueError(f"Unknown file format: {ext!r}")


def splitting(word: str, text):
    """
    Split *text* at the requested granularity.

    Input:

    word: one of "line", "sentences", "paragraphs", "words".
    text: list of lines (for "line") or a long string (other modes).

    Returns: list of tokens at the requested granularity. For
    "paragraphs", fragments shorter than 50 characters are dropped.
    """
    if word == "line":
        # Remove empty lines (text is already a list here).
        tok_text = [line for line in text if line != '']
    elif word == "sentences":
        tok_text = sent_tokenize(text)
    elif word == "paragraphs":
        # BUG FIX: the original removed items from the list while
        # iterating over it, which silently skips elements; build a new
        # list instead.
        tok_text = [p for p in re.split(r'\n{2,}', text) if len(p) >= 50]
    elif word == "words":
        tok_text = wordpunct_tokenize(text)
    else:
        # BUG FIX: the original raised UnboundLocalError for unknown modes.
        raise ValueError(f"Unknown splitting mode: {word!r}")
    return tok_text


def filtering(text):
    """
    -----------------------------------------------------------------------------

    This function takes as argument the string obtained in the reading
    step and filters out undesired characters.

    Potential things to filter: index of contents, titles, formulas,
    references, tables (?)

    Returns: Long string with all the sentences in the document.

    -----------------------------------------------------------------------------

    Input:

    text: string obtained in the previous reading step.

    """
    # Raw strings avoid invalid-escape DeprecationWarnings; the regex
    # semantics are identical to the original patterns.
    clean1 = re.sub(r"\d{1,}.\d{1,}.+", "", text)  # table-of-contents entry numbers
    clean1 = re.sub(
        r"\w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n",
        "", clean1)  # table-of-contents rows ("Title .... 12")
    clean1 = re.sub(r" \n\d{1,} \n | \n\d{1,} \n \n |\d{1,}\. \w{1,} \w{1,}", "", clean1)  # page numbers / numbered headings
    clean1 = re.sub(r"\.{4,} \d{1,}|\.{4,} Error! Bookmark not defined.", " ", clean1)  # index dot leaders
    clean1 = re.sub(r"\n\n\n\n\n+|\n \n+", " ", clean1)  # long page jumps
    clean1 = re.sub(r"\no |\n\uf0b7", "", clean1)  # bullet markers ("o " and Wingdings bullet)
    return clean1


def ctrlf(words: list, text):
    """
    Collect every "sentence" of *text* that contains one of *words*
    surrounded by spaces (a sentence being a period-free run up to and
    including the next period), in the order the words are given.

    Input:

    words: list of words to look for (matched case-sensitively).
    text: string to search in.

    Returns: list of matching sentence strings.
    """
    matches = []
    for term in words:
        matches.extend(re.findall(f"[^.]* {term} [^.]*\.", text))
    return matches


def everything_vs_word(query, corpus, model_name, number=5, score_function=util.cos_sim, ax=None):
    """
    -----------------------------------------------------------------------------

    This function takes as arguments the text that we want to compare, the
    query with respect to which we want to compare, the number of
    comparisons we want to show (by default 5), the model used, and the
    metric used to compute the similarity (by default cosine similarity).

    Returns: seaborn barplot of the top matches and their scores.

    -----------------------------------------------------------------------------

    Input:

    query: String
    corpus: String or list of strings (usually the latter for a document --> list of sentences)
    model_name: String
    number: Int
    score_function: Function
    ax: Axis object

    """
    # model info retrieval
    model = SentenceTransformer(model_name)

    # encode query and corpus with the chosen model
    corpus_embedding = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # semantic search gives a list (one entry per query) of lists of dicts;
    # a single query means we only need the first entry.
    hits = util.semantic_search(query_embedding, corpus_embedding,
                                top_k=number, score_function=score_function)[0]

    scoring = [hit['score'] for hit in hits]
    corp = [corpus[hit['corpus_id']] for hit in hits]

    # defining dataframe for easiness in plotting; building from a dict
    # keeps 'Score' numeric (np.column_stack coerced it to strings).
    data = pd.DataFrame({'Expression': corp, 'Score': scoring})
    data['Score'] = data['Score'].astype('float')
    # BUG FIX: the original called sort_values without assigning the
    # result, so the sort was a no-op.
    data = data.sort_values(by=['Score'], ascending=False)

    return sns.barplot(data=data.reset_index(), ax=ax, x='Score', y='Expression')


def sim(query, corpus, model_name, number=5, score_function=util.cos_sim):
    """
    Compute the semantic similarity of *query* against *corpus* and return
    the top matches as a DataFrame (same pipeline as everything_vs_word,
    but returning the data instead of plotting it).

    Input:

    query: String
    corpus: String or list of strings
    model_name: String
    number: Int
    score_function: Function

    Returns: DataFrame with columns 'Expression' and 'Score', sorted by
    descending score.
    """
    # model info retrieval
    model = SentenceTransformer(model_name)

    # encode query and corpus with the chosen model
    corpus_embedding = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # semantic search gives a list (one entry per query) of lists of dicts.
    hits = util.semantic_search(query_embedding, corpus_embedding,
                                top_k=number, score_function=score_function)[0]

    scoring = [hit['score'] for hit in hits]
    corp = [corpus[hit['corpus_id']] for hit in hits]

    # defining dataframe; dict construction keeps 'Score' numeric.
    data = pd.DataFrame({'Expression': corp, 'Score': scoring})
    data['Score'] = data['Score'].astype('float')
    # BUG FIX: the original discarded the sort_values result (no-op sort).
    data = data.sort_values(by=['Score'], ascending=False)
    return data


def sim_2(query: list, corpus, model_name, threshold, number=5, score_function=util.cos_sim):
    """
    Run sim() for every query string and merge the results, keeping each
    corpus expression once with its best score.

    Input:

    query: list of query strings
    corpus: String or list of strings
    model_name: String
    threshold: unused for now, kept for interface compatibility —
        presumably meant to filter rows by minimum score (TODO confirm).
    number: Int
    score_function: Function

    Returns: DataFrame with columns 'Expression' and 'Score', sorted by
    descending score, duplicates dropped.
    """
    frames = []
    for q in query:
        # BUG FIX: the original called the undefined name `functions.sim`
        # and indexed `query[i]` with a list *element* (TypeError for
        # string queries); it also ignored the caller's score_function.
        frames.append(sim(q, corpus, model_name=model_name,
                          number=number, score_function=score_function))

    # BUG FIX: pd.DataFrame(frames) would build a frame OF frames;
    # concatenate the per-query results row-wise instead.
    result = pd.concat(frames, ignore_index=True)
    result = result.sort_values(by=['Score'], ascending=False)
    result.drop_duplicates(subset=['Expression'], inplace=True)
    return result


############ EXTRA BALL ################
# detecting the conclusion and getting all the sentences of that paragraph for future use.
def conclusion():
    """Placeholder: will detect the conclusion section and collect all
    sentences of that paragraph for future use. Not implemented yet."""
    return None


########## Get a function with the distribution of the results per word