Spaces:
Build error
Build error
#*********************************************************************
# This module could be a first building block of the project.
# For now it contains only functions used throughout the files, but
# in the future it could contain more complex structures.
#*********************************************************************
| import pdfplumber | |
| import docx2txt | |
| import os | |
| import re | |
| import numpy as np | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from sentence_transformers import SentenceTransformer, models,util | |
| import nltk | |
| from nltk.tokenize import sent_tokenize, wordpunct_tokenize | |
| nltk.download("punkt") | |
def reading_word(string):
    """Extract the text of a .docx file.

    Input:
        string: path of the .docx file we want to read
    Returns: one long string with the document's text.
    """
    # BUG FIX: the original ignored `string` and always processed the
    # hard-coded file "var.docx"; use the argument instead.
    return docx2txt.process(string)
def reading_pdf(string):
    """Extract the text of a PDF, keeping only small characters.

    Input:
        string: path of the PDF file we want to read
    Returns: one long string; each page's text is preceded by a newline.
    """
    all_text = ""
    with pdfplumber.open(string) as pdf:
        for pdf_page in pdf.pages:
            # Drop large characters (size >= 10) -- presumably titles /
            # headings; TODO confirm the size threshold against real documents.
            small = pdf_page.filter(
                lambda obj: not (obj["object_type"] == "char" and obj["size"] >= 10)
            )
            single_page_text = small.extract_text(x_tolerance=2)
            # BUG FIX: extract_text() can return None for an empty page;
            # the original crashed on `str + None`.
            if single_page_text:
                # separate each page's text with a newline
                all_text = all_text + '\n' + single_page_text
    return all_text
def reading_file(string):
    """
    -----------------------------------------------------------------------------
    This function takes as argument the file that we want to analyze. Depending
    on the file type we use the matching python library.
    For the moment we detect only: PDF and Word (.docx).
    Returns: Long string with all the sentences in the document
    -----------------------------------------------------------------------------
    Input:
        string: path of the file we want to analyze
    Raises:
        ValueError: if the extension is neither .pdf nor .docx.
    """
    ext = os.path.splitext(string)[-1].lower()
    if ext == ".pdf":
        return reading_pdf(string)
    if ext == ".docx":
        return reading_word(string)
    # BUG FIX: the original printed a message and then returned the unbound
    # name `text` (UnboundLocalError); fail explicitly instead.
    raise ValueError(f"Unknown file format: {ext!r}")
def splitting(word: str, text):
    """Split `text` at the granularity named by `word`.

    Input:
        word: one of "line", "sentences", "paragraphs", "words"
        text: a list of lines (for "line") or one long string (other modes)
    Returns: list of tokens (lines / sentences / paragraphs / words).
    Raises:
        ValueError: for an unrecognised `word`.
    """
    if word == "line":
        # Remove empty lines (`text` is expected to be a list of lines here).
        return [line for line in text if line != '']
    if word == "sentences":
        return sent_tokenize(text)
    if word == "paragraphs":
        paragraphs = re.split(r'\n{2,}', text)
        # BUG FIX: the original removed items from the list while iterating
        # over it, which silently skips elements; filter into a new list.
        return [p for p in paragraphs if len(p) >= 50]
    if word == "words":
        return wordpunct_tokenize(text)
    # BUG FIX: the original fell through and raised UnboundLocalError.
    raise ValueError(f"Unknown splitting mode: {word!r}")
def filtering(text):
    """
    -----------------------------------------------------------------------------
    This function takes as argument the string obtained in the reading step and
    filters out undesired characters.
    Potential things to filter: index of contents, titles, formulas, references,
    tables (?)
    Returns: Long string with all the sentences in the document.
    -----------------------------------------------------------------------------
    Input:
        text: string obtained in the previous reading step.
    """
    # All patterns converted to raw strings (idiomatic for regex; avoids
    # invalid-escape warnings). The patterns themselves are unchanged.
    # NOTE(review): the unescaped '.' below matches ANY character, so the
    # pattern is broader than "digits.digits" -- confirm whether
    # r"\d{1,}\.\d{1,}.+" was intended.
    clean1 = re.sub(r"\d{1,}.\d{1,}.+", "", text)  # table-of-contents numbering
    # table-of-contents entries like "word word .... 12"
    clean1 = re.sub(
        r"\w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n",
        "",
        clean1,
    )
    # standalone page numbers and numbered headings
    clean1 = re.sub(r" \n\d{1,} \n | \n\d{1,} \n \n |\d{1,}\. \w{1,} \w{1,}", "", clean1)
    # index dot-leaders and Word's broken-bookmark marker
    clean1 = re.sub(r"\.{4,} \d{1,}|\.{4,} Error! Bookmark not defined.", " ", clean1)
    # long page jumps
    clean1 = re.sub(r"\n\n\n\n\n+|\n \n+", " ", clean1)
    # bullet markers ("o " after a newline, and the \uf0b7 bullet glyph)
    clean1 = re.sub(r"\no |\n\uf0b7", "", clean1)
    return clean1
def ctrlf(words: list, text):
    """Collect every sentence of `text` containing one of `words` (like Ctrl+F).

    Input:
        words: list of words to look for; each must appear surrounded by
               single spaces inside a sentence ending with '.'
        text: string to search in
    Returns: list of matching sentences (a sentence may appear once per
    word it matches).
    """
    matches = []
    for word in words:
        # NOTE(review): `word` is interpolated unescaped, so regex
        # metacharacters in it alter the pattern -- confirm callers pass
        # plain words, or wrap with re.escape().
        # IDIOM FIX: extend() replaces the original quadratic
        # `b = b + [a[i]]` index loop; the resulting list is identical.
        matches.extend(re.findall(rf"[^.]* {word} [^.]*\.", text))
    return matches
def everything_vs_word(query, corpus, model_name, number=5, score_function=util.cos_sim, ax=None):
    """
    -----------------------------------------------------------------------------
    This function takes as arguments the text that we want to compare, the query
    with respect to which we want to compare, the number of comparisons we want
    to show (by default 5), the model used, and the metric used to compute the
    similarity (by default cosine similarity).
    Returns: seaborn bar plot of the top `number` matches.
    -----------------------------------------------------------------------------
    Input:
        query: String
        corpus: String or list of strings (usually the latter for a document --> list of sentences)
        number: Int
        model_name: String
        score_function: Function
        ax: Axis object
    """
    # model info retrieval
    model = SentenceTransformer(model_name)
    # embed query and corpus according to the model
    corpus_embedding = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)
    # semantic search gives one list of {'corpus_id', 'score'} dicts per query
    hits = util.semantic_search(query_embedding, corpus_embedding,
                                top_k=number, score_function=score_function)
    hits = hits[0]
    scoring = [hit['score'] for hit in hits]
    corp = [corpus[hit['corpus_id']] for hit in hits]
    # dataframe for easiness in plotting
    data = pd.DataFrame({'Expression': corp, 'Score': scoring})
    data['Score'] = data['Score'].astype('float')
    # BUG FIX: sort_values returns a new frame; the original discarded the
    # result, so the sort had no effect on the plot.
    data = data.sort_values(by=['Score'], ascending=False)
    return sns.barplot(data=data.reset_index(), ax=ax, x='Score', y='Expression')
def sim(query, corpus, model_name, number=5, score_function=util.cos_sim):
    """Semantic search of `query` against `corpus`.

    Input:
        query: String
        corpus: String or list of strings (usually a list of sentences)
        model_name: String, sentence-transformers model to load
        number: Int, number of top matches to keep
        score_function: Function, similarity metric (default cosine)
    Returns: DataFrame with columns 'Expression' and 'Score', sorted by
    descending score.
    """
    # model info retrieval
    model = SentenceTransformer(model_name)
    # embed query and corpus according to the model
    corpus_embedding = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)
    # semantic search gives one list of {'corpus_id', 'score'} dicts per query
    hits = util.semantic_search(query_embedding, corpus_embedding,
                                top_k=number, score_function=score_function)
    hits = hits[0]
    scoring = [hit['score'] for hit in hits]
    corp = [corpus[hit['corpus_id']] for hit in hits]
    data = pd.DataFrame({'Expression': corp, 'Score': scoring})
    data['Score'] = data['Score'].astype('float')
    # BUG FIX: sort_values returns a new frame; the original discarded the
    # result, so the returned frame was never actually sorted.
    data = data.sort_values(by=['Score'], ascending=False)
    return data
def sim_2(query: list, corpus, model_name, threshold, number=5, score_function=util.cos_sim):
    """Run sim() for every query string and merge the results.

    Input:
        query: list of query strings
        corpus: String or list of strings to search in
        model_name: String
        threshold: currently unused -- kept for interface compatibility
                   (TODO: drop rows whose Score is below this value)
        number: Int, matches kept per query
        score_function: Function, similarity metric (default cosine)
    Returns: DataFrame with 'Expression'/'Score', best score first,
    duplicate expressions dropped.
    """
    # BUG FIXES vs the original:
    #  - called the undefined name `functions.sim` (NameError);
    #  - iterated `for i in query` and then indexed `query[i]`, which is
    #    wrong for a list of strings -- iterate the elements directly;
    #  - wrapped a list of DataFrames in pd.DataFrame(); pd.concat is the
    #    correct way to stack them;
    #  - ignored the `score_function` argument -- now passed through.
    frames = [
        sim(q, corpus, model_name=model_name, number=number, score_function=score_function)
        for q in query
    ]
    result = pd.concat(frames, ignore_index=True)
    result = result.sort_values(by=['Score'], ascending=False)
    result.drop_duplicates(subset=['Expression'], inplace=True)
    return result
############ EXTRA BALL ################
# detecting the conclusion and getting all the sentences of that paragraph for future use.
def conclusion():
    # TODO: not implemented yet -- intended to locate the "Conclusion"
    # section of a document and return its sentences for further analysis.
    return
########## Get a function with the distribution of the results per word