belgrano91 commited on
Commit
ae89b7e
·
1 Parent(s): cb0bd86

added functions

Browse files
Files changed (1) hide show
  1. app.py +118 -0
app.py CHANGED
@@ -3,6 +3,124 @@ import functions
3
 
4
  demo=gr.Blocks()
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  with demo:
7
  gr.Image("logo_credit_agricole_CIB_0.jpg")
8
  gr.Markdown("## Important Sentences Recognizer")
 
3
 
4
  demo=gr.Blocks()
5
 
6
# Load the keyword sheet once at startup.  sheet_name=1 selects the
# second sheet of the workbook (sheets can also be indexed by name).
df = pd.read_excel('TESTS.xlsx', sheet_name=1)
# The first column of the sheet holds the search terms used by the app.
words = df.values[:, 0].tolist()
8
+
9
+
10
def reading_word(string):
    """Extract the plain text of a .docx file.

    Input:
        string: path of the .docx file to read.

    Returns: the document text as a single string.

    Bug fix: the original ignored its argument and always processed the
    hard-coded file "var.docx"; it now reads the path it is given.
    """
    return docx2txt.process(string)
13
+
14
def reading_pdf(string):
    """Extract text from a PDF, keeping only small-font characters.

    Input:
        string: path of the PDF file to read.

    Returns: one string with the text of every page, pages separated by
    a newline.

    NOTE(review): the filter drops char objects with size >= 10, i.e. it
    keeps the *smaller* glyphs — presumably to skip titles/headings;
    confirm the size threshold against the documents in use.
    """
    all_text = ""
    with pdfplumber.open(string) as pdf:
        for pdf_page in pdf.pages:
            filtered = pdf_page.filter(
                lambda obj: not (obj["object_type"] == "char" and obj["size"] >= 10)
            )
            single_page_text = filtered.extract_text(x_tolerance=2)
            # extract_text() returns None for pages with no text; the
            # original crashed on the concatenation in that case.
            if single_page_text is not None:
                # separate each page's text with a newline
                all_text = all_text + '\n' + single_page_text
    return all_text
24
+
25
+
26
def reading_file(file_obj):
    """Dispatch an uploaded file to the right text extractor.

    -----------------------------------------------------------------------------
    Takes the uploaded file object and, depending on its extension, uses
    the matching reader.  Only PDF and Word (.docx) are supported.

    Returns: long string with all the sentences in the document.
    -----------------------------------------------------------------------------

    Input:
        file_obj: uploaded file object exposing the original path in
            its ``orig_name`` attribute (Gradio upload object).

    Raises:
        ValueError: for any other extension.  (The original printed a
        message and then crashed with NameError on the unbound ``text``.)
    """
    path = file_obj.orig_name
    ext = os.path.splitext(path)[-1].lower()
    if ext == ".pdf":
        return reading_pdf(path)
    if ext == ".docx":
        return reading_word(path)
    raise ValueError("Unknown file format.")
52
+
53
def filtering(text):
    """Filter undesired spans out of the extracted document text.

    -----------------------------------------------------------------------------
    Takes the string obtained in the reading step and strips artifacts:
    table-of-contents entries, dotted index leaders, long runs of page
    breaks and bullet characters.
    (Potential things still to filter: titles, formulas, references, tables.)

    Returns: long string with all the sentences in the document.
    -----------------------------------------------------------------------------

    Input:
        text: string obtained in the previous reading step.

    Fixes over the original: the malformed ``\"\"\"\"`` docstring opener is
    corrected and all regex patterns are raw strings (the ``"\\d"``-style
    escapes raise DeprecationWarning); the patterns themselves are
    byte-equivalent, so matching behavior is unchanged.
    """
    # Remove numbered table-of-contents lines (e.g. "1.2 Section ...").
    clean1 = re.sub(r"\d{1,}.\d{1,}.+", "", text)
    # Remove "word ... 12"-style TOC rows with dotted leaders.
    clean1 = re.sub(r"\w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \.{4,} \d{1,}\d{1,}\n|\w{1,} \w{1,} \w{1,} \.{4,} \d{1,}\d{1,}\n", "", clean1)
    # Remove bare page numbers and "1. Word Word" heading fragments.
    clean1 = re.sub(r" \n\d{1,} \n | \n\d{1,} \n \n |\d{1,}\. \w{1,} \w{1,}", "", clean1)
    # Remove remaining dotted index leaders and Word's broken-bookmark text.
    clean1 = re.sub(r"\.{4,} \d{1,}|\.{4,} Error! Bookmark not defined.", " ", clean1)
    # Collapse long runs of page jumps into a single space.
    clean1 = re.sub(r"\n\n\n\n\n+|\n \n+", " ", clean1)
    # Remove "o " list markers at line starts and \uf0b7 bullet glyphs.
    clean1 = re.sub(r"\no |\n\uf0b7", "", clean1)
    return clean1
79
+
80
def splitting(word, text):
    """Split *text* into the units selected by *word*.

    Input:
        word: one of "line", "sentences" or "paragraphs".
        text: the document text to split.

    Returns: list of strings (lines, sentences, or paragraphs).

    Raises:
        ValueError: for any other mode.  (The original fell through and
        crashed with NameError on the unbound ``tok_text``.)

    Bug fix: the "line" branch originally filtered the *characters* of
    the string (``filter`` over a ``str`` yields chars, none of which
    equal ``''``); it now splits into lines and drops the empty ones,
    as the original comment intended.
    """
    if word == "line":
        return [line for line in text.splitlines() if line]  # remove empty lines
    if word == "sentences":
        return sent_tokenize(text)
    if word == "paragraphs":
        return text.split('\n\n')
    raise ValueError("Choose a valid option")
90
+
91
+
92
def ctrlf(words: list, text):
    """Collect every sentence of *text* containing any of *words*.

    Input:
        words: list of search terms (each matched surrounded by spaces).
        text: the document text to search.

    Returns: list of matched sentence-like spans (text up to a '.'),
    in document order, concatenated across all words.

    NOTE(review): the word is interpolated unescaped into the pattern,
    so regex metacharacters in a term would change the match — confirm
    the term list is plain words.
    """
    matches = []
    for word in words:
        # A "sentence" here is any dot-free span around the word,
        # terminated by a literal '.'.
        matches.extend(re.findall(rf"[^.]* {word} [^.]*\.", text))
        # Perf fix: the original rebuilt the list with ``b = b + [a[i]]``
        # in an index loop — accidental O(n^2); extend() is linear.
    return matches
103
+
104
+
105
def total(corpus, query, split_param, model_name: str, number: int, function: str):
    """Take the filtered text and perform the NLP similarity analysis.

    Input:
        corpus: filtered document text.
        query: query sentence(s) to score against the corpus.
        split_param: unit passed to splitting() — "line", "sentences"
            or "paragraphs".
        model_name: name of the sentence-transformer model to use.
        number: number of results to return.
        function: "cosine similarity" or "dot score".

    Returns: result of functions.sim() on the split corpus.

    Raises:
        ValueError: for an unrecognized *function*.  (The original
        printed a message and then crashed with NameError on the
        unbound ``score_function``.)
    """
    splitted = splitting(split_param, corpus)

    if function == "cosine similarity":
        score_function = util.cos_sim
    elif function == "dot score":
        score_function = util.dot_score
    else:
        raise ValueError("Choose a valid option")

    return functions.sim(
        query,
        corpus=splitted,
        model_name=model_name,
        number=number,
        score_function=score_function,
    )
123
+
124
  with demo:
125
  gr.Image("logo_credit_agricole_CIB_0.jpg")
126
  gr.Markdown("## Important Sentences Recognizer")