Spaces:
Build error
Build error
Commit ·
ae89b7e
1
Parent(s): cb0bd86
added functions
Browse files
app.py
CHANGED
|
@@ -3,6 +3,124 @@ import functions
|
|
| 3 |
|
| 4 |
demo=gr.Blocks()
|
| 5 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
with demo:
|
| 7 |
gr.Image("logo_credit_agricole_CIB_0.jpg")
|
| 8 |
gr.Markdown("## Important Sentences Recognizer")
|
|
|
|
| 3 |
|
| 4 |
demo=gr.Blocks()
|
| 5 |
|
| 6 |
+
# Load the reference words from the tests workbook.
# sheet_name=1 selects the SECOND sheet (0-indexed); a sheet can also be
# indexed by name, or all sheets fetched at once.
df = pd.read_excel('TESTS.xlsx',sheet_name=1) # can also index sheet by name or fetch all sheets
# First column of the sheet as a plain Python list.
# NOTE(review): .values.T[0] takes column 0 after a dtype-upcasting
# conversion to a numpy array — assumes the sheet's first column holds
# the words; confirm against TESTS.xlsx.
words=df.values.T[0].tolist()
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def reading_word(string):
    """Extract the full text of a Word (.docx) document.

    Args:
        string: path of the .docx file to read.

    Returns:
        The document's text as one string.
    """
    # Bug fix: the original ignored its argument and always processed the
    # hard-coded file "var.docx"; read the path the caller supplied instead
    # (reading_file passes the uploaded file's path here).
    text = docx2txt.process(string)
    return text
|
| 13 |
+
|
| 14 |
+
def reading_pdf(string):
    """Extract text from a PDF, keeping only small characters.

    Characters with size >= 10 are filtered out before extraction
    (presumably to drop headings/large print — confirm against the
    source documents).

    Args:
        string: path of the PDF file to read.

    Returns:
        One string; each page's text is preceded by a newline.
    """
    page_texts = []
    with pdfplumber.open(string) as pdf:
        for page in pdf.pages:
            # Keep every object EXCEPT chars of size >= 10.
            small_chars = page.filter(
                lambda obj: not (obj["object_type"] == "char" and obj["size"] >= 10)
            )
            page_texts.append(small_chars.extract_text(x_tolerance=2))
    # Matches the original accumulation: '\n' before every page's text.
    return "".join("\n" + text for text in page_texts)
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
def reading_file(file_obj):
    """Read the text of an uploaded document, dispatching on extension.

    Only PDF (.pdf, via reading_pdf) and Word (.docx, via reading_word)
    files are supported.

    Args:
        file_obj: uploaded file wrapper; the original path is taken from
            file_obj.orig_name (gradio-style upload object — confirm).

    Returns:
        One long string with all the sentences in the document.

    Raises:
        ValueError: if the extension is neither .pdf nor .docx.
    """
    string = file_obj.orig_name
    ext = os.path.splitext(string)[-1].lower()
    if ext == ".pdf":
        return reading_pdf(string)
    if ext == ".docx":
        return reading_word(string)
    # Bug fix: the original printed a message and then executed
    # `return text` with `text` never assigned, crashing with a
    # NameError. Fail explicitly instead.
    raise ValueError(f"Unknown file format: {ext!r}")
|
| 52 |
+
|
| 53 |
+
def filtering(text):
    """Filter undesired artifacts out of the extracted document text.

    Removes table-of-contents numbering and dotted-leader index lines,
    stray page numbers, long runs of blank lines, and bullet markers.

    Args:
        text: string produced by the reading step.

    Returns:
        The cleaned string.
    """
    # Fix: the original patterns used "\d", "\w", "\." in NON-raw strings,
    # which are invalid escape sequences (SyntaxWarning on modern Python).
    # Raw strings below produce byte-identical regex patterns, so matching
    # behavior is unchanged.
    clean1 = re.sub(r"\d{1,}.\d{1,}.+", "", text)  # table-of-contents numbering
    clean1 = re.sub(
        r"\w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n",
        "", clean1)  # table-of-contents lines with dotted leaders
    clean1 = re.sub(r" \n\d{1,} \n | \n\d{1,} \n \n |\d{1,}\. \w{1,} \w{1,}", "", clean1)  # page numbers / numbered headings
    clean1 = re.sub(r"\.{4,} \d{1,}|\.{4,} Error! Bookmark not defined.", " ", clean1)  # index leaders
    clean1 = re.sub(r"\n\n\n\n\n+|\n \n+", " ", clean1)  # long page jumps
    # Left non-raw on purpose: "\n" (newline) and "\uf0b7" (private-use
    # bullet glyph) are valid Python escapes; the regex match is identical.
    clean1 = re.sub("\no |\n\uf0b7", "", clean1)  # bullet markers
    return clean1
|
| 79 |
+
|
| 80 |
+
def splitting(word, text):
    """Split the text into units selected by *word*.

    Args:
        word: one of "line", "sentences", "paragraphs".
        text: the (filtered) text to split.

    Returns:
        A list of tokens.

    Raises:
        ValueError: for an unknown *word*. (Bug fix: the original fell
        through all branches and crashed with UnboundLocalError.)
    """
    if word == "line":
        # NOTE(review): iterating a str yields single characters, so for a
        # string input this returns a list of its characters — this branch
        # looks intended for a list of lines; confirm against callers.
        tok_text = list(filter(lambda a: a != '', text))  # remove empty items
    elif word == "sentences":
        tok_text = sent_tokenize(text)  # nltk sentence tokenizer
    elif word == "paragraphs":
        tok_text = text.split('\n\n')
    else:
        raise ValueError(f"Unknown split mode: {word!r}")
    return tok_text
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def ctrlf(words: list, text):
    """Collect every sentence of *text* containing any of *words*.

    A "sentence" is a dot-free run ending in '.', and the word must
    appear surrounded by single spaces (same pattern as the original).

    Args:
        words: list of words to search for.
        text: text to scan.

    Returns:
        List of matching sentences, grouped by word in input order.
    """
    matches = []
    for word in words:
        # NOTE(review): *word* is interpolated unescaped, so regex
        # metacharacters in a search word would alter the pattern;
        # re.escape(word) would harden this if ever user-supplied.
        found = re.findall(rf"[^.]* {word} [^.]*\.", text)
        # Fix: replaces the original quadratic `b = b + [a[i]]` index loop.
        matches.extend(found)
    return matches
|
| 103 |
+
|
| 104 |
+
|
| 105 |
+
def total(corpus, query, split_param, model_name: str, number: int, function: str):
    """Split the filtered text and perform the NLP similarity analysis.

    Args:
        corpus: filtered document text.
        query: query sentence(s) to match against the corpus.
        split_param: split granularity ("line" / "sentences" / "paragraphs").
        model_name: sentence-transformers model name passed to functions.sim.
        number: number of results requested from functions.sim.
        function: scoring choice, "cosine similarity" or "dot score".

    Returns:
        Whatever functions.sim returns (project-local module — the result
        shape is defined there, not here).

    Raises:
        ValueError: for an unknown *function*. (Bug fix: the original
        printed "Choose a valid option" and then crashed later with an
        UnboundLocalError on score_function.)
    """
    splitted = splitting(split_param, corpus)

    if function == "cosine similarity":
        score_function = util.cos_sim
    elif function == "dot score":
        score_function = util.dot_score
    else:
        raise ValueError(f"Choose a valid option, not {function!r}")

    return functions.sim(query, corpus=splitted, model_name=model_name,
                         number=number, score_function=score_function)
|
| 123 |
+
|
| 124 |
with demo:
|
| 125 |
gr.Image("logo_credit_agricole_CIB_0.jpg")
|
| 126 |
gr.Markdown("## Important Sentences Recognizer")
|