File size: 6,596 Bytes
cdd06f9
fc46d80
ec821af
 
 
93de492
ec821af
 
 
 
 
 
93de492
cdd06f9
 
ae89b7e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93de492
cdd06f9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8f2b747
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import os
import re

import docx2txt
import gradio as gr
import nltk
import numpy as np
import pandas as pd
import pdfplumber
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util

import functions

nltk.download("punkt")  # tokenizer models required by sent_tokenize




# Load the keyword list offered as checkboxes in the UI: sheet index 1 of
# TESTS.xlsx, first column.
# NOTE(review): assumes TESTS.xlsx sits in the working directory — confirm.
df = pd.read_excel('TESTS.xlsx',sheet_name=1) # can also index sheet by name or fetch all sheets
words=df.values.T[0].tolist() 


def reading_word(string):
    """Extract the plain text of a Word (.docx) document.

    Input:
        string: path of the .docx file to read.

    Returns: one long string with the document's text.
    """
    # Bug fix: the original ignored its argument and always read the
    # hard-coded "var.docx"; use the path the caller supplied instead.
    return docx2txt.process(string)

def reading_pdf(string):
    """Extract text from a PDF, keeping only the small print.

    Characters with font size >= 10 (titles, headings) are filtered out;
    non-character objects are kept.

    Input:
        string: path of the PDF file to read.

    Returns: one long string, each page's text preceded by a newline.
    """
    pages_text = []
    with pdfplumber.open(string) as pdf:
        for page in pdf.pages:
            # Keep everything that is NOT a large character.
            small_print = page.filter(
                lambda obj: not (obj["object_type"] == "char" and obj["size"] >= 10)
            )
            pages_text.append(small_print.extract_text(x_tolerance=2))
    return "".join("\n" + page_text for page_text in pages_text)


def reading_file(file_obj):
    """Dispatch on file extension and return the document's text.

    Depending on the file type we use the matching reader. For the moment
    we detect only PDF and Word (.docx) files.

    Input:
        file_obj: uploaded file object; its ``orig_name`` attribute holds
            the original path/name used to detect the extension.

    Returns: one long string with all the sentences in the document.

    Raises:
        ValueError: if the extension is neither .pdf nor .docx.
    """
    string = file_obj.orig_name
    ext = os.path.splitext(string)[-1].lower()
    if ext == ".pdf":
        return reading_pdf(string)
    if ext == ".docx":
        return reading_word(string)
    # Bug fix: the original printed a message and then returned the unbound
    # local `text`, crashing with UnboundLocalError; fail explicitly instead.
    raise ValueError(f"Unknown file format: {ext!r}")

def filtering(text):
    """Filter undesired artifacts out of the raw document text.

    Removes table-of-contents entries, dotted index leaders, stray page
    numbers, long runs of page-break newlines and bullet characters.
    Potential things still to filter: titles, formulas, references,
    tables (?).

    Input:
        text: string obtained in the previous reading step.

    Returns: one long string with all the sentences in the document.
    """
    # Fix: raw strings replace the invalid escape sequences (\d, \w, \.)
    # the original relied on; the regex patterns themselves are unchanged.
    clean1 = re.sub(r"\d{1,}.\d{1,}.+", "", text)  # numbered ToC entries ("1.2 ...")
    clean1 = re.sub(r"\w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n", "", clean1)  # ToC lines with dotted leaders
    clean1 = re.sub(r" \n\d{1,} \n | \n\d{1,} \n \n |\d{1,}\. \w{1,} \w{1,}", "", clean1)  # isolated page numbers / numbered headings
    clean1 = re.sub(r"\.{4,} \d{1,}|\.{4,} Error! Bookmark not defined.", " ", clean1)  # index leaders / broken bookmarks
    clean1 = re.sub(r"\n\n\n\n\n+|\n \n+", " ", clean1)  # long page jumps
    clean1 = re.sub(r"\no |\n\uf0b7", "", clean1)  # bullet markers ("o ", U+F0B7)
    return clean1

def splitting(word, text):
    """Split *text* at the granularity named by *word*.

    Input:
        word: one of "line"/"lines", "sentences", "paragraphs" or "words"
            (the aliases match what the UI textbox advertises; the original
            crashed with UnboundLocalError on "lines" and "words").
        text: the (filtered) document text.

    Returns: list of text fragments.

    Raises:
        ValueError: for an unrecognized granularity.
    """
    if word in ("line", "lines"):
        # NOTE(review): iterating a *string* yields characters, so this
        # returns the individual characters; kept for compatibility, but it
        # probably expected a list of lines — confirm with callers.
        return list(filter(lambda a: a != '', text))
    if word == "sentences":
        return sent_tokenize(text)
    if word == "paragraphs":
        return text.split('\n\n')
    if word == "words":
        # Generalization: the UI offers "words" but it was never handled.
        return text.split()
    raise ValueError(f"Unknown split granularity: {word!r}")


def ctrlf(words: list, text):
    """Emulate ctrl+f: collect every sentence containing one of *words*.

    A "sentence" here is any dot-free run of characters ending in a period
    that contains the keyword as a space-delimited token.

    Input:
        words: list of keywords to search for.
        text: document text to scan.

    Returns: list of matching sentence strings, grouped by keyword order.
    """
    matches = []
    for word in words:
        found = re.findall(f"[^.]* {word} [^.]*\.", text)
        matches.extend(found)
    return matches


def total(corpus, query, split_param, model_name: str, number: int, function: str):
    """
    Takes the filtered text and performs the NLP similarity analysis.

    Input:
        corpus: filtered document text.
        query: sentence(s) to compare against the corpus.
        split_param: granularity passed to splitting() ("sentences", ...).
        model_name: SentenceTransformer model identifier.
        number: top-k results to keep.
        function: "cosine similarity", or "dot score"/"dot product".

    Returns: the result of functions.sim (presumably a dataframe of scored
        sentences — confirm against functions.sim).

    Raises:
        ValueError: for an unrecognized similarity-function name.
    """
    splitted = splitting(split_param, corpus)

    if function == "cosine similarity":
        score_function = util.cos_sim
    elif function in ("dot score", "dot product"):
        # Bug fix: the UI dropdown sends "dot product", but the original
        # only matched "dot score" and then crashed on an unbound local.
        score_function = util.dot_score
    else:
        # Bug fix: the original printed a warning and fell through to an
        # UnboundLocalError; fail explicitly instead.
        raise ValueError(f"Choose a valid option, not {function!r}")

    return functions.sim(query, corpus=splitted, model_name=model_name,
                         number=number, score_function=score_function)
    
# ---------------------------------------------------------------------------
# Gradio UI: wires the pipeline read -> filter -> ctrl+f / split -> NLP.
# ---------------------------------------------------------------------------
demo = gr.Blocks()
with demo:
    gr.Image("logo_credit_agricole_CIB_0.jpg")
    gr.Markdown("## Important Sentences Recognizer")
    gr.Markdown("This project aims to retrieve critical sentences related with some important words in a document.")

    # Step 1: upload the document, read it and filter the raw text.
    with gr.Box():
        with gr.Row():
            file = gr.File()
            with gr.Column():
                b1 = gr.Button("Reading file", variant="primary")
                t1 = gr.Textbox(label="Result")

                b2 = gr.Button("Filtering")
                t2 = gr.Textbox(label="Result")

    # Step 2: plain keyword search (ctrl+f style) over the filtered text.
    gr.Markdown("Now we run ctrl+f method.")
    with gr.Box():
        checkbox1 = gr.CheckboxGroup(words, label="Select desired words")
        b4 = gr.Button("Run analysis")
        t4 = gr.Textbox(label="Result")

    # Step 3: choose how to parse the text before the semantic analysis.
    gr.Markdown("But first we need to choose how to parse the text.")
    with gr.Box():
        t = gr.Textbox(label="Write: sentences or paragraphs or lines or words", value="sentences")
        b3 = gr.Button("Split text")
        t3 = gr.Textbox(label="Result")

    # Step 4: semantic similarity analysis over the split text.
    gr.Markdown("Using previous the previous result, we run now the NLP analysis.")
    with gr.Box():
        gr.Markdown("Now we will proceed with the analysis.")
        dropdown1 = gr.Dropdown(choices=["all-MiniLM-L6-v2", "multi-qa-mpnet-base-dot-v1", "msmarco-distilbert-base-v4"], label="Model")
        slider1 = gr.Slider(1, 100, 10, label="Top k", interactive=True, step=1)
        dropdown2 = gr.Dropdown(choices=["cosine similarity", "dot product"], label="Similarity function")
        b5 = gr.Button("Run analysis", variant="primary")

    # Bug fix: a bare `gr.Markdown` attribute access (no call) sat here in
    # the original — a no-op, removed.
    df1 = gr.Dataframe(row_count=(1, "dynamic"), col_count=(2, "fixed"), label="Important sentences", headers=["Expression", "Score"], overflow_row_behaviour="paginate")

    # Wire each button to its pipeline step.
    b1.click(reading_file, inputs=file, outputs=t1)
    b2.click(filtering, inputs=t1, outputs=t2)
    b3.click(splitting, inputs=[t, t2], outputs=t3)
    b4.click(ctrlf, [checkbox1, t2], t4)
    b5.click(fn=total, inputs=[t2, t4, t, dropdown1, slider1, dropdown2], outputs=df1)
demo.launch()