# belgrano91's picture
# Update app.py
# 8f2b747
import os
import re

import docx2txt
import gradio as gr
import nltk
import numpy as np
import pandas as pd
import pdfplumber
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util

nltk.download("punkt")

import functions
# Load the keyword sheet: sheet_name=1 selects the *second* sheet of TESTS.xlsx.
df = pd.read_excel('TESTS.xlsx',sheet_name=1) # can also index sheet by name or fetch all sheets
# First column of that sheet as a plain Python list — used below to populate
# the "Select desired words" checkbox group. NOTE(review): presumably one
# search word per row; verify the spreadsheet layout matches.
words=df.values.T[0].tolist()
def reading_word(string):
    """Extract the plain text of a Word (.docx) document.

    Input:
        string: path of the .docx file to read.
    Returns: one long string with the document's text.
    """
    # BUG FIX: the original ignored `string` and always processed the
    # hard-coded "var.docx", so any uploaded .docx was silently skipped.
    return docx2txt.process(string)
def reading_pdf(string):
    """Extract text from a PDF, skipping large (size >= 10) characters.

    The per-page filter drops char objects whose font size is >= 10 —
    presumably to strip titles/headers — and keeps everything else.

    Input:
        string: path of the PDF file to read.
    Returns: one long string, pages separated by newlines.
    """
    all_text = ""
    with pdfplumber.open(string) as pdf:
        for pdf_page in pdf.pages:
            # Keep objects that are NOT big characters (i.e. small text + non-chars).
            filtered = pdf_page.filter(
                lambda obj: not (obj["object_type"] == "char" and obj["size"] >= 10)
            )
            single_page_text = filtered.extract_text(x_tolerance=2)
            # BUG FIX: extract_text() returns None for pages with no text;
            # concatenating None would raise TypeError in the original.
            if single_page_text:
                # separate each page's text with newline
                all_text = all_text + '\n' + single_page_text
    return all_text
def reading_file(file_obj):
    """Read an uploaded file, dispatching on its extension.

    -----------------------------------------------------------------------------
    Takes the file we want to analyze and picks the right reader for its
    type. For the moment only PDF and Word (.docx) are supported.
    Returns: long string with all the sentences in the document.
    -----------------------------------------------------------------------------
    Input:
        file_obj: Gradio file object; its .orig_name holds the original path.
    Raises:
        ValueError: for any extension other than .pdf / .docx.
    """
    string = file_obj.orig_name
    # BUG FIX: `os` was used here but never imported; now imported at top of file.
    ext = os.path.splitext(string)[-1].lower()
    if ext == ".pdf":
        return reading_pdf(string)
    if ext == ".docx":
        return reading_word(string)
    # BUG FIX: the original printed a message and then hit an
    # UnboundLocalError on `return text`; raise a clear error instead.
    raise ValueError("Unknown file format.")
def filtering(text):
    """Filter undesired characters out of the raw extracted text.

    -----------------------------------------------------------------------------
    Takes the string obtained in the reading step and strips artifacts:
    table-of-contents entries, dotted index leaders, long runs of page
    jumps, and bullet characters.
    Returns: long string with all the sentences in the document.
    -----------------------------------------------------------------------------
    Input:
        text: string obtained in the previous reading step.
    """
    # FIX: all patterns are now raw strings — the originals relied on
    # invalid escape sequences like "\d" in plain strings, which emit
    # SyntaxWarning/DeprecationWarning on modern Python. The compiled
    # patterns are unchanged.
    # Remove "1.2 ..." style table-of-contents numbering.
    clean1 = re.sub(r"\d{1,}.\d{1,}.+", "", text)
    # Remove "Title .... 12"-style table-of-contents rows.
    clean1 = re.sub(
        r"\w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n",
        "", clean1)
    # Remove stray page numbers and "1. Word Word" heading fragments.
    clean1 = re.sub(r" \n\d{1,} \n | \n\d{1,} \n \n |\d{1,}\. \w{1,} \w{1,}", "", clean1)
    # Filter dotted index leaders and broken bookmark references.
    clean1 = re.sub(r"\.{4,} \d{1,}|\.{4,} Error! Bookmark not defined.", " ", clean1)
    # Collapse long page jumps (runs of blank lines) into a single space.
    clean1 = re.sub(r"\n\n\n\n\n+|\n \n+", " ", clean1)
    # Drop "o " bullets at line start and Wingdings bullet chars (U+F0B7).
    clean1 = re.sub(r"\no |\n\uf0b7", "", clean1)
    return clean1
def splitting(word, text):
    """Split `text` into units chosen by `word`.

    Input:
        word: "line"/"lines", "sentences", or "paragraphs".
        text: the (filtered) document text.
    Returns: list of non-empty lines, NLTK sentences, or paragraphs.
    Raises:
        ValueError: for any other `word` value.
    """
    # BUG FIX: the original "line" branch filtered the *characters* of the
    # string (filter() iterates a str char by char), and the UI textbox
    # says "lines" while the code only accepted "line". Split on real
    # lines, drop empties, and accept both spellings.
    if word in ("line", "lines"):
        return [line for line in text.splitlines() if line]
    if word == "sentences":
        return sent_tokenize(text)
    if word == "paragraphs":
        return text.split('\n\n')
    # BUG FIX: the original fell through and crashed with UnboundLocalError.
    raise ValueError(f"Unknown split mode: {word!r}")
def ctrlf(words: list, text):
    """Ctrl+F-style search: collect sentences containing each word.

    A "sentence" here is a maximal run of non-period characters that
    contains " word " (case sensitive, space-delimited) and ends at ".".

    Input:
        words: list of search words.
        text: the (filtered) document text.
    Returns: flat list of matching sentences, in word order.
    """
    matches = []
    for word in words:
        # rf-string: keep the regex raw so "\." is not an invalid escape.
        # IDIOM FIX: extend() replaces the original quadratic
        # index loop (`b = b + [a[i]]`); same elements, same order.
        matches.extend(re.findall(rf"[^.]* {word} [^.]*\.", text))
    return matches
def total(corpus, query, split_param, model_name: str, number: int, function: str):
    """Take the filtered text and perform the NLP analysis.

    Input:
        corpus: filtered document text.
        query: the query sentence(s) to compare against the corpus.
        split_param: how to split the corpus (see splitting()).
        model_name: SentenceTransformer model identifier.
        number: top-k results to keep.
        function: "cosine similarity", or "dot score"/"dot product".
    Returns: whatever functions.sim produces (a dataframe shown in the UI).
    Raises:
        ValueError: for an unrecognized similarity function name.
    """
    splitted = splitting(split_param, corpus)
    if function == "cosine similarity":
        score_function = util.cos_sim
    # BUG FIX: the UI dropdown offers "dot product", but the original only
    # checked "dot score", so the second choice printed a message and then
    # crashed with UnboundLocalError. Accept both spellings.
    elif function in ("dot score", "dot product"):
        score_function = util.dot_score
    else:
        raise ValueError(f"Choose a valid option, got: {function!r}")
    return functions.sim(query, corpus=splitted, model_name=model_name,
                         number=number, score_function=score_function)
# ---------------------------------------------------------------------------
# Gradio UI: upload -> read -> filter -> (ctrl+f | split -> NLP similarity).
# Each button feeds its textbox result into the next step's input.
# ---------------------------------------------------------------------------
demo = gr.Blocks()
with demo:
    gr.Image("logo_credit_agricole_CIB_0.jpg")
    gr.Markdown("## Important Sentences Recognizer")
    gr.Markdown("This project aims to retrieve critical sentences related with some important words in a document.")
    with gr.Box():
        with gr.Row():
            file = gr.File()
            with gr.Column():
                b1 = gr.Button("Reading file", variant="primary")
                t1 = gr.Textbox(label="Result")      # raw extracted text
                b2 = gr.Button("Filtering")
                t2 = gr.Textbox(label="Result")      # filtered text, feeds later steps
    gr.Markdown("Now we run ctrl+f method.")
    with gr.Box():
        # `words` comes from the TESTS.xlsx sheet loaded at module level.
        checkbox1 = gr.CheckboxGroup(words, label="Select desired words")
        b4 = gr.Button("Run analysis")
        t4 = gr.Textbox(label="Result")
    gr.Markdown("But first we need to choose how to parse the text.")
    with gr.Box():
        t = gr.Textbox(label="Write: sentences or paragraphs or lines or words", value="sentences")
        b3 = gr.Button("Split text")
        t3 = gr.Textbox(label="Result")
    gr.Markdown("Using previous the previous result, we run now the NLP analysis.")
    with gr.Box():
        gr.Markdown("Now we will proceed with the analysis.")
        dropdown1 = gr.Dropdown(choices=["all-MiniLM-L6-v2", "multi-qa-mpnet-base-dot-v1", "msmarco-distilbert-base-v4"], label="Model")
        slider1 = gr.Slider(1, 100, 10, label="Top k", interactive=True, step=1)
        dropdown2 = gr.Dropdown(choices=["cosine similarity", "dot product"], label="Similarity function")
        b5 = gr.Button("Run analysis", variant="primary")
        # BUG FIX: removed a stray bare `gr.Markdown` attribute access here —
        # it was never called, so it rendered nothing and did nothing.
        df1 = gr.Dataframe(row_count=(1, "dynamic"), col_count=(2, "fixed"), label="Important sentences", headers=["Expression", "Score"], overflow_row_behaviour="paginate")
    # Wire buttons to the processing pipeline defined above.
    b1.click(reading_file, inputs=file, outputs=t1)
    b2.click(filtering, inputs=t1, outputs=t2)
    b3.click(splitting, inputs=[t, t2], outputs=t3)
    b4.click(ctrlf, [checkbox1, t2], t4)
    b5.click(fn=total, inputs=[t2, t4, t, dropdown1, slider1, dropdown2], outputs=df1)
demo.launch()