File size: 6,596 Bytes
cdd06f9
fc46d80
ec821af
 
 
93de492
ec821af
 
 
 
 
 
93de492
cdd06f9
 
ae89b7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93de492
cdd06f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8f2b747
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import os
import re

import docx2txt
import gradio as gr
import nltk
import numpy as np
import pandas as pd
import pdfplumber
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util

import functions

nltk.download("punkt")  # tokenizer models required by sent_tokenize




# Load the keyword list offered as checkboxes in the UI: sheet index 1 of
# TESTS.xlsx, first column.
# NOTE(review): assumes TESTS.xlsx sits in the working directory — confirm.
df = pd.read_excel('TESTS.xlsx',sheet_name=1) # can also index sheet by name or fetch all sheets
words=df.values.T[0].tolist() 


def reading_word(string):
    """Extract the plain text of a Word (.docx) document.

    Input:
        string: path of the .docx file to read.

    Returns: one long string with the document's text.
    """
    # Bug fix: the original ignored its argument and always read the
    # hard-coded "var.docx"; use the path the caller supplied instead.
    return docx2txt.process(string)

def reading_pdf(string):
    """Extract text from a PDF, keeping only the small print.

    Characters with font size >= 10 (titles, headings) are filtered out;
    non-character objects are kept.

    Input:
        string: path of the PDF file to read.

    Returns: one long string, each page's text preceded by a newline.
    """
    pages_text = []
    with pdfplumber.open(string) as pdf:
        for page in pdf.pages:
            # Keep everything that is NOT a large character.
            small_print = page.filter(
                lambda obj: not (obj["object_type"] == "char" and obj["size"] >= 10)
            )
            pages_text.append(small_print.extract_text(x_tolerance=2))
    return "".join("\n" + page_text for page_text in pages_text)


def reading_file(file_obj):
    """Dispatch on file extension and return the document's text.

    Depending on the file type we use the matching reader. For the moment
    we detect only PDF and Word (.docx) files.

    Input:
        file_obj: uploaded file object; its ``orig_name`` attribute holds
            the original path/name used to detect the extension.

    Returns: one long string with all the sentences in the document.

    Raises:
        ValueError: if the extension is neither .pdf nor .docx.
    """
    string = file_obj.orig_name
    ext = os.path.splitext(string)[-1].lower()
    if ext == ".pdf":
        return reading_pdf(string)
    if ext == ".docx":
        return reading_word(string)
    # Bug fix: the original printed a message and then returned the unbound
    # local `text`, crashing with UnboundLocalError; fail explicitly instead.
    raise ValueError(f"Unknown file format: {ext!r}")

def filtering(text):
    """Filter undesired artifacts out of the raw document text.

    Removes table-of-contents entries, dotted index leaders, stray page
    numbers, long runs of page-break newlines and bullet characters.
    Potential things still to filter: titles, formulas, references,
    tables (?).

    Input:
        text: string obtained in the previous reading step.

    Returns: one long string with all the sentences in the document.
    """
    # Fix: raw strings replace the invalid escape sequences (\d, \w, \.)
    # the original relied on; the regex patterns themselves are unchanged.
    clean1 = re.sub(r"\d{1,}.\d{1,}.+", "", text)  # numbered ToC entries ("1.2 ...")
    clean1 = re.sub(r"\w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n", "", clean1)  # ToC lines with dotted leaders
    clean1 = re.sub(r" \n\d{1,} \n | \n\d{1,} \n \n |\d{1,}\. \w{1,} \w{1,}", "", clean1)  # isolated page numbers / numbered headings
    clean1 = re.sub(r"\.{4,} \d{1,}|\.{4,} Error! Bookmark not defined.", " ", clean1)  # index leaders / broken bookmarks
    clean1 = re.sub(r"\n\n\n\n\n+|\n \n+", " ", clean1)  # long page jumps
    clean1 = re.sub(r"\no |\n\uf0b7", "", clean1)  # bullet markers ("o ", U+F0B7)
    return clean1

def splitting(word, text):
    """Split *text* at the granularity named by *word*.

    Input:
        word: one of "line"/"lines", "sentences", "paragraphs" or "words"
            (the aliases match what the UI textbox advertises; the original
            crashed with UnboundLocalError on "lines" and "words").
        text: the (filtered) document text.

    Returns: list of text fragments.

    Raises:
        ValueError: for an unrecognized granularity.
    """
    if word in ("line", "lines"):
        # NOTE(review): iterating a *string* yields characters, so this
        # returns the individual characters; kept for compatibility, but it
        # probably expected a list of lines — confirm with callers.
        return list(filter(lambda a: a != '', text))
    if word == "sentences":
        return sent_tokenize(text)
    if word == "paragraphs":
        return text.split('\n\n')
    if word == "words":
        # Generalization: the UI offers "words" but it was never handled.
        return text.split()
    raise ValueError(f"Unknown split granularity: {word!r}")


def ctrlf(words: list, text):
    """Emulate ctrl+f: collect every sentence containing one of *words*.

    A "sentence" here is any dot-free run of characters ending in a period
    that contains the keyword as a space-delimited token.

    Input:
        words: list of keywords to search for.
        text: document text to scan.

    Returns: list of matching sentence strings, grouped by keyword order.
    """
    matches = []
    for word in words:
        found = re.findall(f"[^.]* {word} [^.]*\.", text)
        matches.extend(found)
    return matches


def total(corpus, query, split_param, model_name: str, number: int, function: str):
    """
    Takes the filtered text and performs the NLP similarity analysis.

    Input:
        corpus: filtered document text.
        query: sentence(s) to compare against the corpus.
        split_param: granularity passed to splitting() ("sentences", ...).
        model_name: SentenceTransformer model identifier.
        number: top-k results to keep.
        function: "cosine similarity", or "dot score"/"dot product".

    Returns: the result of functions.sim (presumably a dataframe of scored
        sentences — confirm against functions.sim).

    Raises:
        ValueError: for an unrecognized similarity-function name.
    """
    splitted = splitting(split_param, corpus)

    if function == "cosine similarity":
        score_function = util.cos_sim
    elif function in ("dot score", "dot product"):
        # Bug fix: the UI dropdown sends "dot product", but the original
        # only matched "dot score" and then crashed on an unbound local.
        score_function = util.dot_score
    else:
        # Bug fix: the original printed a warning and fell through to an
        # UnboundLocalError; fail explicitly instead.
        raise ValueError(f"Choose a valid option, not {function!r}")

    return functions.sim(query, corpus=splitted, model_name=model_name,
                         number=number, score_function=score_function)
    
# ---------------------------------------------------------------------------
# Gradio UI: wires the pipeline read -> filter -> ctrl+f / split -> NLP.
# ---------------------------------------------------------------------------
demo = gr.Blocks()
with demo:
    gr.Image("logo_credit_agricole_CIB_0.jpg")
    gr.Markdown("## Important Sentences Recognizer")
    gr.Markdown("This project aims to retrieve critical sentences related with some important words in a document.")

    # Step 1: upload the document, read it and filter the raw text.
    with gr.Box():
        with gr.Row():
            file = gr.File()
            with gr.Column():
                b1 = gr.Button("Reading file", variant="primary")
                t1 = gr.Textbox(label="Result")

                b2 = gr.Button("Filtering")
                t2 = gr.Textbox(label="Result")

    # Step 2: plain keyword search (ctrl+f style) over the filtered text.
    gr.Markdown("Now we run ctrl+f method.")
    with gr.Box():
        checkbox1 = gr.CheckboxGroup(words, label="Select desired words")
        b4 = gr.Button("Run analysis")
        t4 = gr.Textbox(label="Result")

    # Step 3: choose how to parse the text before the semantic analysis.
    gr.Markdown("But first we need to choose how to parse the text.")
    with gr.Box():
        t = gr.Textbox(label="Write: sentences or paragraphs or lines or words", value="sentences")
        b3 = gr.Button("Split text")
        t3 = gr.Textbox(label="Result")

    # Step 4: semantic similarity analysis over the split text.
    gr.Markdown("Using previous the previous result, we run now the NLP analysis.")
    with gr.Box():
        gr.Markdown("Now we will proceed with the analysis.")
        dropdown1 = gr.Dropdown(choices=["all-MiniLM-L6-v2", "multi-qa-mpnet-base-dot-v1", "msmarco-distilbert-base-v4"], label="Model")
        slider1 = gr.Slider(1, 100, 10, label="Top k", interactive=True, step=1)
        dropdown2 = gr.Dropdown(choices=["cosine similarity", "dot product"], label="Similarity function")
        b5 = gr.Button("Run analysis", variant="primary")

    # Bug fix: a bare `gr.Markdown` attribute access (no call) sat here in
    # the original — a no-op, removed.
    df1 = gr.Dataframe(row_count=(1, "dynamic"), col_count=(2, "fixed"), label="Important sentences", headers=["Expression", "Score"], overflow_row_behaviour="paginate")

    # Wire each button to its pipeline step.
    b1.click(reading_file, inputs=file, outputs=t1)
    b2.click(filtering, inputs=t1, outputs=t2)
    b3.click(splitting, inputs=[t, t2], outputs=t3)
    b4.click(ctrlf, [checkbox1, t2], t4)
    b5.click(fn=total, inputs=[t2, t4, t, dropdown1, slider1, dropdown2], outputs=df1)
demo.launch()