Spaces:
Build error
Build error
File size: 6,596 Bytes
# --- Imports and module-level setup -----------------------------------------
# Bug fix: `os` is used by reading_file() (os.path.splitext) but was never
# imported, crashing with NameError on the first file upload.
import os
import re

import docx2txt
import gradio as gr
import nltk
import numpy as np
import pandas as pd
import pdfplumber
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util

import functions

# Sentence tokenizer model used by splitting(); fetched once at startup.
nltk.download("punkt")

# Sheet index 1 of TESTS.xlsx: its first column holds the keywords offered
# as checkboxes in the UI.  -- assumes the file sits in the working dir.
df = pd.read_excel('TESTS.xlsx', sheet_name=1)  # can also index sheet by name or fetch all sheets
words = df.values.T[0].tolist()
def reading_word(string):
    """Extract the plain text of a Word (.docx) document.

    Input:
        string: path of the .docx file to read.
    Returns: the document's text as one long string.
    """
    # Bug fix: the original ignored its argument and always processed the
    # hard-coded debug file "var.docx"; use the caller-supplied path.
    return docx2txt.process(string)
def reading_pdf(string):
    """Extract text from a PDF, keeping only small characters (size < 10).

    Each page is filtered so that char objects with font size >= 10 are
    dropped (removes large text such as titles/headings); non-char objects
    and small chars are kept, then the remaining text is extracted.

    Input:
        string: path of the PDF file to read.
    Returns: one string; each page's text is preceded by a newline,
             matching the original page-separator behaviour.
    """
    pages = []
    with pdfplumber.open(string) as pdf:
        for pdf_page in pdf.pages:
            small = pdf_page.filter(
                lambda obj: not (obj["object_type"] == "char" and obj["size"] >= 10)
            )
            single_page_text = small.extract_text(x_tolerance=2)
            # Bug fix: extract_text() returns None for pages with no text;
            # the original crashed with a TypeError on string concatenation.
            if single_page_text:
                pages.append(single_page_text)
    # join() instead of repeated `+=` (the original was quadratic);
    # the leading "\n" reproduces the original output exactly.
    return ("\n" + "\n".join(pages)) if pages else ""
def reading_file(file_obj):
    """
    -----------------------------------------------------------------------------
    This function takes as argument the uploaded file that we want to analyze.
    Depending on the file type we use the matching python library.
    For the moment we detect only: PDF and Word (.docx).
    Returns: Long string with all the sentences in the document
    -----------------------------------------------------------------------------
    Input:
        file_obj: gradio File object; its .orig_name attribute is the path
                  of the file we want to analyze.
    Raises: ValueError for unsupported file extensions.
    """
    # Bug fix: `os` was used here but never imported at module level.
    import os

    string = file_obj.orig_name
    ext = os.path.splitext(string)[-1].lower()
    if ext == ".pdf":
        return reading_pdf(string)
    if ext == ".docx":
        return reading_word(string)
    # Bug fix: the original printed a message and then crashed with an
    # UnboundLocalError on `return text`; raise a clear error instead.
    raise ValueError(f"Unknown file format: {ext!r} (expected .pdf or .docx)")
def filtering(text):
    """
    -----------------------------------------------------------------------------
    This function takes as argument the string obtained in the reading step
    and filters out undesired characters.
    Potential things to filter: Index of contents, titles, formulas,
    references, tables (?)
    Returns: Long string with all the sentences in the document.
    -----------------------------------------------------------------------------
    Input:
        text: string obtained in the previous reading step.
    """
    # Bug fix: the patterns below were plain strings, where "\d" / "\w" are
    # invalid escape sequences (SyntaxWarning on modern Python); they are now
    # raw strings with byte-identical regex semantics.
    # NOTE(review): in the first pattern the bare '.' matches ANY character;
    # it was probably meant to be an escaped dot (r"\.") — TODO confirm.
    clean1 = re.sub(r"\d{1,}.\d{1,}.+", "", text)  # table-of-contents numbering
    clean1 = re.sub(r"\w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n", "", clean1)  # table-of-contents entries
    clean1 = re.sub(r" \n\d{1,} \n | \n\d{1,} \n \n |\d{1,}\. \w{1,} \w{1,}", "", clean1)  # page numbers / numbered headings
    clean1 = re.sub(r"\.{4,} \d{1,}|\.{4,} Error! Bookmark not defined.", " ", clean1)  # filtering the index dot leaders
    clean1 = re.sub(r"\n\n\n\n\n+|\n \n+", " ", clean1)  # filtering long page jumps
    clean1 = re.sub(r"\no |\n\uf0b7", "", clean1)  # list bullets
    return clean1
def splitting(word, text):
    """Split the filtered text into analysis units.

    Input:
        word: split mode — "line"/"lines", "sentences", "paragraphs" or
              "words" (the UI textbox advertises all four spellings).
        text: the (filtered) document text.
    Returns: list of text fragments.
    Raises: ValueError for an unknown split mode (the original crashed with
            an UnboundLocalError instead).
    """
    if word in ("line", "lines"):
        # Bug fix: the original filtered the *characters* of the string
        # (list(filter(..., text))) and never split on newlines at all;
        # also accept the plural "lines" that the UI suggests.
        return [line for line in text.split("\n") if line != ""]
    if word == "sentences":
        return sent_tokenize(text)
    if word == "paragraphs":
        return text.split("\n\n")
    if word == "words":
        # Advertised in the UI textbox but previously unhandled.
        return text.split()
    raise ValueError(f"Unknown split mode: {word!r}")
def ctrlf(words: list, text):
    """Return every "sentence" of *text* containing one of *words*.

    A sentence is a maximal period-free run ending in '.'; the keyword must
    appear surrounded by single spaces (same shape as the original pattern).

    Input:
        words: list of keywords to search for.
        text: the text to scan.
    Returns: list of matching sentences, in keyword order, duplicates kept.
    """
    matches = []
    for word in words:
        # Bug fix: re.escape guards against keywords that contain regex
        # metacharacters (e.g. '.', '+', '(') corrupting the pattern.
        pattern = rf"[^.]* {re.escape(word)} [^.]*\."
        matches.extend(re.findall(pattern, text))
    return matches
def total(corpus, query, split_param, model_name: str, number: int, function: str):
    """Take the filtered text and perform the NLP similarity analysis.

    Input:
        corpus: filtered document text.
        query: sentence(s) to compare against the corpus.
        split_param: how to parse the corpus (see splitting()).
        model_name: sentence-transformers model name.
        number: top-k results to return.
        function: "cosine similarity", or "dot score"/"dot product".
    Returns: the result of functions.sim (scored sentences).
    Raises: ValueError for an unknown similarity function.
    """
    splitted = splitting(split_param, corpus)
    if function == "cosine similarity":
        score_function = util.cos_sim
    elif function in ("dot score", "dot product"):
        # Bug fix: the UI dropdown sends "dot product", which the original
        # comparison ("dot score") never matched, leaving score_function
        # unbound; accept both spellings.
        score_function = util.dot_score
    else:
        # Bug fix: the original printed a message and then crashed on the
        # unbound score_function; fail loudly instead.
        raise ValueError(f"Choose a valid similarity function, got {function!r}")
    return functions.sim(query, corpus=splitted, model_name=model_name,
                         number=number, score_function=score_function)
# ---------------------------------------------------------------------------
# Gradio UI: upload a document, read/filter/split it, then run the ctrl+f
# keyword search and the semantic-similarity analysis on the result.
# ---------------------------------------------------------------------------
demo = gr.Blocks()
with demo:
    gr.Image("logo_credit_agricole_CIB_0.jpg")
    gr.Markdown("## Important Sentences Recognizer")
    gr.Markdown("This project aims to retrieve critical sentences related with some important words in a document.")
    with gr.Box():
        with gr.Row():
            file = gr.File()
            with gr.Column():
                b1 = gr.Button("Reading file", variant="primary")
                t1 = gr.Textbox(label="Result")
                b2 = gr.Button("Filtering")
                t2 = gr.Textbox(label="Result")
    gr.Markdown("Now we run ctrl+f method.")
    with gr.Box():
        checkbox1 = gr.CheckboxGroup(words, label="Select desired words")
        b4 = gr.Button("Run analysis")
        t4 = gr.Textbox(label="Result")
    gr.Markdown("But first we need to choose how to parse the text.")
    with gr.Box():
        t = gr.Textbox(label="Write: sentences or paragraphs or lines or words", value="sentences")
        b3 = gr.Button("Split text")
        t3 = gr.Textbox(label="Result")
    # Typo fix: was "Using previous the previous result".
    gr.Markdown("Using the previous result, we run now the NLP analysis.")
    with gr.Box():
        gr.Markdown("Now we will proceed with the analysis.")
        dropdown1 = gr.Dropdown(choices=["all-MiniLM-L6-v2", "multi-qa-mpnet-base-dot-v1", "msmarco-distilbert-base-v4"], label="Model")
        slider1 = gr.Slider(1, 100, 10, label="Top k", interactive=True, step=1)
        # Bug fix: the choice label must match what total() compares against
        # ("dot score"); "dot product" silently selected no score function.
        dropdown2 = gr.Dropdown(choices=["cosine similarity", "dot score"], label="Similarity function")
        b5 = gr.Button("Run analysis", variant="primary")
        # Bug fix: a bare `gr.Markdown` (attribute access, never called) was
        # a silent no-op here; removed.
        df1 = gr.Dataframe(row_count=(1, "dynamic"), col_count=(2, "fixed"), label="Important sentences", headers=["Expression", "Score"], overflow_row_behaviour="paginate")
    b1.click(reading_file, inputs=file, outputs=t1)
    b2.click(filtering, inputs=t1, outputs=t2)
    b3.click(splitting, inputs=[t, t2], outputs=t3)
    b4.click(ctrlf, [checkbox1, t2], t4)
    b5.click(fn=total, inputs=[t2, t4, t, dropdown1, slider1, dropdown2], outputs=df1)
demo.launch()