Spaces:
Build error
Build error
#*********************************************************************
# This module could be a first building block of the project.
# For now it contains only functions used throughout the files, but
# in the future it could contain more complex structures.
#*********************************************************************
| import pdfplumber | |
| import docx2txt | |
| import os | |
| import re | |
| import numpy as np | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from sentence_transformers import SentenceTransformer, models,util | |
| import nltk | |
| from nltk.tokenize import sent_tokenize, wordpunct_tokenize | |
| nltk.download("punkt") | |
def reading_word(string):
    """Extract the text of a .docx file.

    Input:
        string: path of the .docx file we want to read
    Returns: one long string with the document's text.
    """
    # BUG FIX: the original ignored `string` and always processed the
    # hard-coded file "var.docx"; use the argument instead.
    return docx2txt.process(string)
def reading_pdf(string):
    """Extract the text of a PDF, keeping only small characters.

    Input:
        string: path of the PDF file we want to read
    Returns: one long string; each page's text is preceded by a newline.
    """
    all_text = ""
    with pdfplumber.open(string) as pdf:
        for pdf_page in pdf.pages:
            # Drop large characters (size >= 10) -- presumably titles /
            # headings; TODO confirm the size threshold against real documents.
            small = pdf_page.filter(
                lambda obj: not (obj["object_type"] == "char" and obj["size"] >= 10)
            )
            single_page_text = small.extract_text(x_tolerance=2)
            # BUG FIX: extract_text() can return None for an empty page;
            # the original crashed on `str + None`.
            if single_page_text:
                # separate each page's text with a newline
                all_text = all_text + '\n' + single_page_text
    return all_text
def reading_file(string):
    """
    -----------------------------------------------------------------------------
    This function takes as argument the file that we want to analyze. Depending
    on the file type we use the matching python library.
    For the moment we detect only: PDF and Word (.docx).
    Returns: Long string with all the sentences in the document
    -----------------------------------------------------------------------------
    Input:
        string: path of the file we want to analyze
    Raises:
        ValueError: if the extension is neither .pdf nor .docx.
    """
    ext = os.path.splitext(string)[-1].lower()
    if ext == ".pdf":
        return reading_pdf(string)
    if ext == ".docx":
        return reading_word(string)
    # BUG FIX: the original printed a message and then returned the unbound
    # name `text` (UnboundLocalError); fail explicitly instead.
    raise ValueError(f"Unknown file format: {ext!r}")
def splitting(word: str, text):
    """Split `text` at the granularity named by `word`.

    Input:
        word: one of "line", "sentences", "paragraphs", "words"
        text: a list of lines (for "line") or one long string (other modes)
    Returns: list of tokens (lines / sentences / paragraphs / words).
    Raises:
        ValueError: for an unrecognised `word`.
    """
    if word == "line":
        # Remove empty lines (`text` is expected to be a list of lines here).
        return [line for line in text if line != '']
    if word == "sentences":
        return sent_tokenize(text)
    if word == "paragraphs":
        paragraphs = re.split(r'\n{2,}', text)
        # BUG FIX: the original removed items from the list while iterating
        # over it, which silently skips elements; filter into a new list.
        return [p for p in paragraphs if len(p) >= 50]
    if word == "words":
        return wordpunct_tokenize(text)
    # BUG FIX: the original fell through and raised UnboundLocalError.
    raise ValueError(f"Unknown splitting mode: {word!r}")
def filtering(text):
    """
    -----------------------------------------------------------------------------
    This function takes as argument the string obtained in the reading step and
    filters out undesired characters.
    Potential things to filter: index of contents, titles, formulas, references,
    tables (?)
    Returns: Long string with all the sentences in the document.
    -----------------------------------------------------------------------------
    Input:
        text: string obtained in the previous reading step.
    """
    # All patterns converted to raw strings (idiomatic for regex; avoids
    # invalid-escape warnings). The patterns themselves are unchanged.
    # NOTE(review): the unescaped '.' below matches ANY character, so the
    # pattern is broader than "digits.digits" -- confirm whether
    # r"\d{1,}\.\d{1,}.+" was intended.
    clean1 = re.sub(r"\d{1,}.\d{1,}.+", "", text)  # table-of-contents numbering
    # table-of-contents entries like "word word .... 12"
    clean1 = re.sub(
        r"\w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n",
        "",
        clean1,
    )
    # standalone page numbers and numbered headings
    clean1 = re.sub(r" \n\d{1,} \n | \n\d{1,} \n \n |\d{1,}\. \w{1,} \w{1,}", "", clean1)
    # index dot-leaders and Word's broken-bookmark marker
    clean1 = re.sub(r"\.{4,} \d{1,}|\.{4,} Error! Bookmark not defined.", " ", clean1)
    # long page jumps
    clean1 = re.sub(r"\n\n\n\n\n+|\n \n+", " ", clean1)
    # bullet markers ("o " after a newline, and the \uf0b7 bullet glyph)
    clean1 = re.sub(r"\no |\n\uf0b7", "", clean1)
    return clean1
def ctrlf(words: list, text):
    """Collect every sentence of `text` containing one of `words` (like Ctrl+F).

    Input:
        words: list of words to look for; each must appear surrounded by
               single spaces inside a sentence ending with '.'
        text: string to search in
    Returns: list of matching sentences (a sentence may appear once per
    word it matches).
    """
    matches = []
    for word in words:
        # NOTE(review): `word` is interpolated unescaped, so regex
        # metacharacters in it alter the pattern -- confirm callers pass
        # plain words, or wrap with re.escape().
        # IDIOM FIX: extend() replaces the original quadratic
        # `b = b + [a[i]]` index loop; the resulting list is identical.
        matches.extend(re.findall(rf"[^.]* {word} [^.]*\.", text))
    return matches
def everything_vs_word(query, corpus, model_name, number=5, score_function=util.cos_sim, ax=None):
    """
    -----------------------------------------------------------------------------
    This function takes as arguments the text that we want to compare, the query
    with respect to which we want to compare, the number of comparisons we want
    to show (by default 5), the model used, and the metric used to compute the
    similarity (by default cosine similarity).
    Returns: seaborn bar plot of the top `number` matches.
    -----------------------------------------------------------------------------
    Input:
        query: String
        corpus: String or list of strings (usually the latter for a document --> list of sentences)
        number: Int
        model_name: String
        score_function: Function
        ax: Axis object
    """
    # model info retrieval
    model = SentenceTransformer(model_name)
    # embed query and corpus according to the model
    corpus_embedding = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)
    # semantic search gives one list of {'corpus_id', 'score'} dicts per query
    hits = util.semantic_search(query_embedding, corpus_embedding,
                                top_k=number, score_function=score_function)
    hits = hits[0]
    scoring = [hit['score'] for hit in hits]
    corp = [corpus[hit['corpus_id']] for hit in hits]
    # dataframe for easiness in plotting
    data = pd.DataFrame({'Expression': corp, 'Score': scoring})
    data['Score'] = data['Score'].astype('float')
    # BUG FIX: sort_values returns a new frame; the original discarded the
    # result, so the sort had no effect on the plot.
    data = data.sort_values(by=['Score'], ascending=False)
    return sns.barplot(data=data.reset_index(), ax=ax, x='Score', y='Expression')
def sim(query, corpus, model_name, number=5, score_function=util.cos_sim):
    """Semantic search of `query` against `corpus`.

    Input:
        query: String
        corpus: String or list of strings (usually a list of sentences)
        model_name: String, sentence-transformers model to load
        number: Int, number of top matches to keep
        score_function: Function, similarity metric (default cosine)
    Returns: DataFrame with columns 'Expression' and 'Score', sorted by
    descending score.
    """
    # model info retrieval
    model = SentenceTransformer(model_name)
    # embed query and corpus according to the model
    corpus_embedding = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)
    # semantic search gives one list of {'corpus_id', 'score'} dicts per query
    hits = util.semantic_search(query_embedding, corpus_embedding,
                                top_k=number, score_function=score_function)
    hits = hits[0]
    scoring = [hit['score'] for hit in hits]
    corp = [corpus[hit['corpus_id']] for hit in hits]
    data = pd.DataFrame({'Expression': corp, 'Score': scoring})
    data['Score'] = data['Score'].astype('float')
    # BUG FIX: sort_values returns a new frame; the original discarded the
    # result, so the returned frame was never actually sorted.
    data = data.sort_values(by=['Score'], ascending=False)
    return data
def sim_2(query: list, corpus, model_name, threshold, number=5, score_function=util.cos_sim):
    """Run sim() for every query string and merge the results.

    Input:
        query: list of query strings
        corpus: String or list of strings to search in
        model_name: String
        threshold: currently unused -- kept for interface compatibility
                   (TODO: drop rows whose Score is below this value)
        number: Int, matches kept per query
        score_function: Function, similarity metric (default cosine)
    Returns: DataFrame with 'Expression'/'Score', best score first,
    duplicate expressions dropped.
    """
    # BUG FIXES vs the original:
    #  - called the undefined name `functions.sim` (NameError);
    #  - iterated `for i in query` and then indexed `query[i]`, which is
    #    wrong for a list of strings -- iterate the elements directly;
    #  - wrapped a list of DataFrames in pd.DataFrame(); pd.concat is the
    #    correct way to stack them;
    #  - ignored the `score_function` argument -- now passed through.
    frames = [
        sim(q, corpus, model_name=model_name, number=number, score_function=score_function)
        for q in query
    ]
    result = pd.concat(frames, ignore_index=True)
    result = result.sort_values(by=['Score'], ascending=False)
    result.drop_duplicates(subset=['Expression'], inplace=True)
    return result
############ EXTRA BALL ################
# detecting the conclusion and getting all the sentences of that paragraph for future use.
def conclusion():
    # TODO: not implemented yet -- intended to locate the "Conclusion"
    # section of a document and return its sentences for further analysis.
    return
########## Get a function with the distribution of the results per word