File size: 7,824 Bytes
8228dae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
#*********************************************************************


# This module could serve as a foundation stone of the project.
# For now it contains only utility functions used throughout the
# other files, but in the future it may hold more complex structures.


#*********************************************************************
import pdfplumber
import docx2txt
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sentence_transformers import SentenceTransformer, models,util
import nltk
from nltk.tokenize import sent_tokenize, wordpunct_tokenize
nltk.download("punkt")



def reading_word(string):
    """
    Extract the plain text of a Word (.docx) document.

    Input:

    string: path of the .docx file to read.

    Returns: string with the document's full text.
    """
    # BUG FIX: the original ignored `string` and always read "var.docx".
    return docx2txt.process(string)

def reading_pdf(string):
    """
    Extract the text of a PDF, keeping only "char" objects with a font
    size below 10 pt (i.e. filtering out large text such as headings).

    Input:

    string: path of the PDF file to read.

    Returns: long string with each page's text preceded by a newline
    (matching the original concatenation scheme).
    """
    pages = []
    with pdfplumber.open(string) as pdf:
        for pdf_page in pdf.pages:
            # Keep everything that is NOT a char of size >= 10.
            small = pdf_page.filter(
                lambda obj: not (obj["object_type"] == "char" and obj["size"] >= 10)
            )
            # BUG FIX: extract_text may return None for an empty/filtered
            # page; the original would crash on str + None.
            single_page_text = small.extract_text(x_tolerance=2) or ""
            pages.append(single_page_text)
    # join() avoids the quadratic str += of the original loop.
    return "".join("\n" + page for page in pages)


def reading_file(string):
    """
    -----------------------------------------------------------------------------

    This function takes as argument the file that we want to analyze and
    dispatches on its extension to the right reader. For the moment we
    detect only: PDF and Word (.docx).

    Returns: Long string with all the sentences in the document.

    Raises: ValueError for any other extension (the original printed a
    message and then crashed with UnboundLocalError).

    -----------------------------------------------------------------------------

    Input:

    string: path of the file we want to analyze

    """
    ext = os.path.splitext(string)[-1].lower()
    if ext == ".pdf":
        return reading_pdf(string)
    if ext == ".docx":
        return reading_word(string)
    # BUG FIX: `text` was unbound here, so the original raised
    # UnboundLocalError right after printing. Fail explicitly instead.
    raise ValueError(f"Unknown file format: {ext!r}")


def splitting(word: str, text):
    """
    Split *text* at the requested granularity.

    Input:

    word: one of "line", "sentences", "paragraphs", "words".
    text: list of lines (for "line") or a long string (other modes).

    Returns: list of tokens at the requested granularity. For
    "paragraphs", fragments shorter than 50 characters are dropped.
    """
    if word == "line":
        # Remove empty lines (text is already a list here).
        tok_text = [line for line in text if line != '']
    elif word == "sentences":
        tok_text = sent_tokenize(text)
    elif word == "paragraphs":
        # BUG FIX: the original removed items from the list while
        # iterating over it, which silently skips elements; build a new
        # list instead.
        tok_text = [p for p in re.split(r'\n{2,}', text) if len(p) >= 50]
    elif word == "words":
        tok_text = wordpunct_tokenize(text)
    else:
        # BUG FIX: the original raised UnboundLocalError for unknown modes.
        raise ValueError(f"Unknown splitting mode: {word!r}")
    return tok_text


def filtering(text):
    """
    -----------------------------------------------------------------------------

    This function takes as argument the string obtained in the reading
    step and filters out undesired characters.

    Potential things to filter: index of contents, titles, formulas,
    references, tables (?)

    Returns: Long string with all the sentences in the document.

    -----------------------------------------------------------------------------

    Input:

    text: string obtained in the previous reading step.

    """
    # Raw strings avoid invalid-escape DeprecationWarnings; the regex
    # semantics are identical to the original patterns.
    clean1 = re.sub(r"\d{1,}.\d{1,}.+", "", text)  # table-of-contents entry numbers
    clean1 = re.sub(
        r"\w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n",
        "", clean1)  # table-of-contents rows ("Title .... 12")
    clean1 = re.sub(r" \n\d{1,} \n | \n\d{1,} \n \n |\d{1,}\. \w{1,} \w{1,}", "", clean1)  # page numbers / numbered headings
    clean1 = re.sub(r"\.{4,} \d{1,}|\.{4,} Error! Bookmark not defined.", " ", clean1)  # index dot leaders
    clean1 = re.sub(r"\n\n\n\n\n+|\n \n+", " ", clean1)  # long page jumps
    clean1 = re.sub(r"\no |\n\uf0b7", "", clean1)  # bullet markers ("o " and Wingdings bullet)
    return clean1


def ctrlf(words: list, text):
    """
    Collect every "sentence" of *text* that contains one of *words*
    surrounded by spaces (a sentence being a period-free run up to and
    including the next period), in the order the words are given.

    Input:

    words: list of words to look for (matched case-sensitively).
    text: string to search in.

    Returns: list of matching sentence strings.
    """
    matches = []
    for term in words:
        matches.extend(re.findall(f"[^.]* {term} [^.]*\.", text))
    return matches


def everything_vs_word(query, corpus, model_name, number=5, score_function=util.cos_sim, ax=None):
    """
    -----------------------------------------------------------------------------

    This function takes as arguments the text that we want to compare, the
    query with respect to which we want to compare, the number of
    comparisons we want to show (by default 5), the model used, and the
    metric used to compute the similarity (by default cosine similarity).

    Returns: seaborn barplot of the top matches and their scores.

    -----------------------------------------------------------------------------

    Input:

    query: String
    corpus: String or list of strings (usually the latter for a document --> list of sentences)
    model_name: String
    number: Int
    score_function: Function
    ax: Axis object

    """
    # model info retrieval
    model = SentenceTransformer(model_name)

    # encode query and corpus with the chosen model
    corpus_embedding = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # semantic search gives a list (one entry per query) of lists of dicts;
    # a single query means we only need the first entry.
    hits = util.semantic_search(query_embedding, corpus_embedding,
                                top_k=number, score_function=score_function)[0]

    scoring = [hit['score'] for hit in hits]
    corp = [corpus[hit['corpus_id']] for hit in hits]

    # defining dataframe for easiness in plotting; building from a dict
    # keeps 'Score' numeric (np.column_stack coerced it to strings).
    data = pd.DataFrame({'Expression': corp, 'Score': scoring})
    data['Score'] = data['Score'].astype('float')
    # BUG FIX: the original called sort_values without assigning the
    # result, so the sort was a no-op.
    data = data.sort_values(by=['Score'], ascending=False)

    return sns.barplot(data=data.reset_index(), ax=ax, x='Score', y='Expression')


def sim(query, corpus, model_name, number=5, score_function=util.cos_sim):
    """
    Compute the semantic similarity of *query* against *corpus* and return
    the top matches as a DataFrame (same pipeline as everything_vs_word,
    but returning the data instead of plotting it).

    Input:

    query: String
    corpus: String or list of strings
    model_name: String
    number: Int
    score_function: Function

    Returns: DataFrame with columns 'Expression' and 'Score', sorted by
    descending score.
    """
    # model info retrieval
    model = SentenceTransformer(model_name)

    # encode query and corpus with the chosen model
    corpus_embedding = model.encode(corpus, convert_to_tensor=True)
    query_embedding = model.encode(query, convert_to_tensor=True)

    # semantic search gives a list (one entry per query) of lists of dicts.
    hits = util.semantic_search(query_embedding, corpus_embedding,
                                top_k=number, score_function=score_function)[0]

    scoring = [hit['score'] for hit in hits]
    corp = [corpus[hit['corpus_id']] for hit in hits]

    # defining dataframe; dict construction keeps 'Score' numeric.
    data = pd.DataFrame({'Expression': corp, 'Score': scoring})
    data['Score'] = data['Score'].astype('float')
    # BUG FIX: the original discarded the sort_values result (no-op sort).
    data = data.sort_values(by=['Score'], ascending=False)
    return data


def sim_2(query: list, corpus, model_name, threshold, number=5, score_function=util.cos_sim):
    """
    Run sim() for every query string and merge the results, keeping each
    corpus expression once with its best score.

    Input:

    query: list of query strings
    corpus: String or list of strings
    model_name: String
    threshold: unused for now, kept for interface compatibility —
        presumably meant to filter rows by minimum score (TODO confirm).
    number: Int
    score_function: Function

    Returns: DataFrame with columns 'Expression' and 'Score', sorted by
    descending score, duplicates dropped.
    """
    frames = []
    for q in query:
        # BUG FIX: the original called the undefined name `functions.sim`
        # and indexed `query[i]` with a list *element* (TypeError for
        # string queries); it also ignored the caller's score_function.
        frames.append(sim(q, corpus, model_name=model_name,
                          number=number, score_function=score_function))

    # BUG FIX: pd.DataFrame(frames) would build a frame OF frames;
    # concatenate the per-query results row-wise instead.
    result = pd.concat(frames, ignore_index=True)
    result = result.sort_values(by=['Score'], ascending=False)
    result.drop_duplicates(subset=['Expression'], inplace=True)
    return result


############ EXTRA BALL ################
# detecting the conclusion and getting all the sentences of that paragraph for future use.
def conclusion():
    """Placeholder: will detect the conclusion section and collect all
    sentences of that paragraph for future use. Not implemented yet."""
    return None


########## Get a function with the distribution of the results per word