Spaces:
Build error
Build error
Commit ·
ae89b7e
1
Parent(s): cb0bd86
added functions
Browse files
app.py
CHANGED
|
@@ -3,6 +3,124 @@ import functions
|
|
| 3 |
|
| 4 |
demo=gr.Blocks()
|
| 5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
with demo:
|
| 7 |
gr.Image("logo_credit_agricole_CIB_0.jpg")
|
| 8 |
gr.Markdown("## Important Sentences Recognizer")
|
|
|
|
| 3 |
|
| 4 |
demo=gr.Blocks()
|
| 5 |
|
| 6 |
+
# Load the reference words from the tests workbook.
# sheet_name=1 selects the SECOND sheet (0-indexed); a sheet can also be
# indexed by name, or all sheets fetched at once.
df = pd.read_excel('TESTS.xlsx',sheet_name=1) # can also index sheet by name or fetch all sheets
# First column of the sheet as a plain Python list.
# NOTE(review): .values.T[0] takes column 0 after a dtype-upcasting
# conversion to a numpy array — assumes the sheet's first column holds
# the words; confirm against TESTS.xlsx.
words=df.values.T[0].tolist()
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def reading_word(string):
    """Extract the full text of a Word (.docx) document.

    Args:
        string: path of the .docx file to read.

    Returns:
        The document's text as one string.
    """
    # Bug fix: the original ignored its argument and always processed the
    # hard-coded file "var.docx"; read the path the caller supplied instead
    # (reading_file passes the uploaded file's path here).
    text = docx2txt.process(string)
    return text
|
| 13 |
+
|
| 14 |
+
def reading_pdf(string):
    """Extract text from a PDF, keeping only small characters.

    Characters with size >= 10 are filtered out before extraction
    (presumably to drop headings/large print — confirm against the
    source documents).

    Args:
        string: path of the PDF file to read.

    Returns:
        One string; each page's text is preceded by a newline.
    """
    page_texts = []
    with pdfplumber.open(string) as pdf:
        for page in pdf.pages:
            # Keep every object EXCEPT chars of size >= 10.
            small_chars = page.filter(
                lambda obj: not (obj["object_type"] == "char" and obj["size"] >= 10)
            )
            page_texts.append(small_chars.extract_text(x_tolerance=2))
    # Matches the original accumulation: '\n' before every page's text.
    return "".join("\n" + text for text in page_texts)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def reading_file(file_obj):
    """Read the text of an uploaded document, dispatching on extension.

    Only PDF (.pdf, via reading_pdf) and Word (.docx, via reading_word)
    files are supported.

    Args:
        file_obj: uploaded file wrapper; the original path is taken from
            file_obj.orig_name (gradio-style upload object — confirm).

    Returns:
        One long string with all the sentences in the document.

    Raises:
        ValueError: if the extension is neither .pdf nor .docx.
    """
    string = file_obj.orig_name
    ext = os.path.splitext(string)[-1].lower()
    if ext == ".pdf":
        return reading_pdf(string)
    if ext == ".docx":
        return reading_word(string)
    # Bug fix: the original printed a message and then executed
    # `return text` with `text` never assigned, crashing with a
    # NameError. Fail explicitly instead.
    raise ValueError(f"Unknown file format: {ext!r}")
|
| 52 |
+
|
| 53 |
+
def filtering(text):
    """Filter undesired artifacts out of the extracted document text.

    Removes table-of-contents numbering and dotted-leader index lines,
    stray page numbers, long runs of blank lines, and bullet markers.

    Args:
        text: string produced by the reading step.

    Returns:
        The cleaned string.
    """
    # Fix: the original patterns used "\d", "\w", "\." in NON-raw strings,
    # which are invalid escape sequences (SyntaxWarning on modern Python).
    # Raw strings below produce byte-identical regex patterns, so matching
    # behavior is unchanged.
    clean1 = re.sub(r"\d{1,}.\d{1,}.+", "", text)  # table-of-contents numbering
    clean1 = re.sub(
        r"\w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n",
        "", clean1)  # table-of-contents lines with dotted leaders
    clean1 = re.sub(r" \n\d{1,} \n | \n\d{1,} \n \n |\d{1,}\. \w{1,} \w{1,}", "", clean1)  # page numbers / numbered headings
    clean1 = re.sub(r"\.{4,} \d{1,}|\.{4,} Error! Bookmark not defined.", " ", clean1)  # index leaders
    clean1 = re.sub(r"\n\n\n\n\n+|\n \n+", " ", clean1)  # long page jumps
    # Left non-raw on purpose: "\n" (newline) and "\uf0b7" (private-use
    # bullet glyph) are valid Python escapes; the regex match is identical.
    clean1 = re.sub("\no |\n\uf0b7", "", clean1)  # bullet markers
    return clean1
|
| 79 |
+
|
| 80 |
+
def splitting(word, text):
    """Split the text into units selected by *word*.

    Args:
        word: one of "line", "sentences", "paragraphs".
        text: the (filtered) text to split.

    Returns:
        A list of tokens.

    Raises:
        ValueError: for an unknown *word*. (Bug fix: the original fell
        through all branches and crashed with UnboundLocalError.)
    """
    if word == "line":
        # NOTE(review): iterating a str yields single characters, so for a
        # string input this returns a list of its characters — this branch
        # looks intended for a list of lines; confirm against callers.
        tok_text = list(filter(lambda a: a != '', text))  # remove empty items
    elif word == "sentences":
        tok_text = sent_tokenize(text)  # nltk sentence tokenizer
    elif word == "paragraphs":
        tok_text = text.split('\n\n')
    else:
        raise ValueError(f"Unknown split mode: {word!r}")
    return tok_text
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def ctrlf(words: list, text):
    """Collect every sentence of *text* containing any of *words*.

    A "sentence" is a dot-free run ending in '.', and the word must
    appear surrounded by single spaces (same pattern as the original).

    Args:
        words: list of words to search for.
        text: text to scan.

    Returns:
        List of matching sentences, grouped by word in input order.
    """
    matches = []
    for word in words:
        # NOTE(review): *word* is interpolated unescaped, so regex
        # metacharacters in a search word would alter the pattern;
        # re.escape(word) would harden this if ever user-supplied.
        found = re.findall(rf"[^.]* {word} [^.]*\.", text)
        # Fix: replaces the original quadratic `b = b + [a[i]]` index loop.
        matches.extend(found)
    return matches
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def total(corpus, query, split_param, model_name: str, number: int, function: str):
    """Split the filtered text and perform the NLP similarity analysis.

    Args:
        corpus: filtered document text.
        query: query sentence(s) to match against the corpus.
        split_param: split granularity ("line" / "sentences" / "paragraphs").
        model_name: sentence-transformers model name passed to functions.sim.
        number: number of results requested from functions.sim.
        function: scoring choice, "cosine similarity" or "dot score".

    Returns:
        Whatever functions.sim returns (project-local module — the result
        shape is defined there, not here).

    Raises:
        ValueError: for an unknown *function*. (Bug fix: the original
        printed "Choose a valid option" and then crashed later with an
        UnboundLocalError on score_function.)
    """
    splitted = splitting(split_param, corpus)

    if function == "cosine similarity":
        score_function = util.cos_sim
    elif function == "dot score":
        score_function = util.dot_score
    else:
        raise ValueError(f"Choose a valid option, not {function!r}")

    return functions.sim(query, corpus=splitted, model_name=model_name,
                         number=number, score_function=score_function)
|
| 123 |
+
|
| 124 |
with demo:
|
| 125 |
gr.Image("logo_credit_agricole_CIB_0.jpg")
|
| 126 |
gr.Markdown("## Important Sentences Recognizer")
|