Marthee committed on
Commit
0fa1e5c
·
1 Parent(s): 6934d4e

Upload doc_search.py

Browse files
Files changed (1) hide show
  1. doc_search.py +220 -0
doc_search.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """Doc_search.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1hyshr_1HJFGUVqKRjK7gfPq75lGIYjsd
8
+ """
9
+
10
+ # !pip install gradio -q
11
+
12
+ import os
13
+ from PIL import Image
14
+ # import gradio as gr
15
+ import fitz
16
+ import glob
17
+ import pandas as pd
18
+ import numpy as np
19
+ import tsadropboxretrieval
20
+ import cv2
21
+ #from db import dropbox_upload_file,dropbox_list_files
22
+ import plotly.express as px
23
+
24
+
25
+
26
+
27
+ #########################################################################################################
28
+ # def pushToDropbox(ip1,ip2):
29
+ # df,img_list=search_docs(ip1,ip2)
30
+ # #push df first
31
+ # print('hi')
32
+ # dropbox_upload_file('.',local_file=df,dropbox_file_path='/SearchedDocs/'+ip1+'SearchSummary.csv')
33
+ # c=0
34
+ # for p in img_list: #push images gallery
35
+ # dropbox_upload_file('.',local_file=np.array(p),dropbox_file_path='/SearchedDocs/'+ip1+str(c)+'.png')
36
+ # c+=1
37
+ # return pop
38
+ #############################################################################################################
39
def clear():
    """Return a 4-tuple of ``None`` values.

    NOTE(review): the only visible reference is a commented-out Gradio
    ``click`` handler with four output components, so this presumably
    resets four UI fields -- confirm against the live caller.
    """
    return (None,) * 4
41
+
42
+ ##################################################################################################################
43
+ # def pushAll(ip1,proj):
44
+ # df,img_list=slow_search(ip1,proj)
45
+
46
+ # #push df first
47
+ # #print('hi')
48
+ # dropbox_upload_file('.',local_file=df,dropbox_file_path='/SearchedDocs/'+ip1+'Searchedproj'+proj+'.csv')
49
+ # c=0
50
+ # for p in img_list: #push images gallery
51
+ # dropbox_upload_file('.',local_file=np.array(p),dropbox_file_path='/SearchedDocs/'+ip1+str(c)+proj+'ALL.png')
52
+ # c+=1
53
+
54
+ ###########################################################################################################
55
def slow_search(keyword, project):  # slow search in all files existing
    """Search every PDF of a project for a keyword and highlight the hits.

    The document list is taken from the first element returned by
    ``tsadropboxretrieval.retrieveProjects(project)``.  Each entry is indexed
    as ``entry[0]`` (used as the document name) and ``entry[1]`` (passed as
    the Dropbox download path).

    Args:
        keyword: Text to look for; it is upper-cased before searching.
            ``None`` short-circuits the search.
        project: Project identifier forwarded to ``retrieveProjects``.

    Returns:
        ``(None, None)`` when ``keyword`` is ``None``.  Otherwise a tuple
        ``(df, img_list)`` where ``df`` is a DataFrame with the columns
        ``'Keyword'``, ``'Document Name'`` and ``'Word Occurrence'`` (one row
        per document with at least one hit) and ``img_list`` holds one BGR
        numpy image per page that contained a hit, rendered at 5x zoom with
        the hits highlighted.
    """
    if keyword is None:
        return None, None

    keyword = keyword.upper()
    zoom = 5
    mat = fitz.Matrix(zoom, zoom)
    columns = ['Keyword', 'Document Name', 'Word Occurrence']
    rows = []  # collected per document; the DataFrame is built once at the end
    img_list = []

    documents = tsadropboxretrieval.retrieveProjects(project)[0]
    # The client does not depend on the file, so create it once and reuse it.
    dbx_team = tsadropboxretrieval.ADR_Access_DropboxTeam('user')

    for filepdf in documents:  # loop over each file of the project
        print(filepdf[0])
        _, res = dbx_team.files_download(path=filepdf[1])

        occ = 0  # occurrences of the keyword in this document
        # The context manager closes the document once its pages are rendered,
        # so memory is not held for every PDF of the project.
        with fitz.open("pdf", res.content) as doc:
            for page in doc:
                matched = page.search_for(keyword)
                if not matched:
                    continue
                occ += len(matched)
                for rect in matched:
                    page.add_highlight_annot(rect)
                # Render only pages with hits; annotations are already applied.
                pix = page.get_pixmap(matrix=mat)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                img_list.append(cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR))

        if occ > 0:
            rows.append([keyword, filepdf[0], occ])

    return pd.DataFrame(rows, columns=columns), img_list
101
+ #######################################################################################################
102
def prepare_sunburst():
    """Load the file listing and split its paths into hierarchy columns.

    Fetches a DataFrame from ``tsadropboxretrieval.GetParquetDF()``, prints
    it, and splits its ``'path_display'`` column on ``'/'`` (at most two
    splits) into the new columns ``'root'``, ``'parent'`` and ``'child'``.
    No figure is built here; only the DataFrame is returned.

    Returns:
        The augmented DataFrame, or ``None`` if any step raised (the error
        is printed instead of propagated).
    """
    hierarchy_columns = ['root', 'parent', 'child']
    try:
        listing = tsadropboxretrieval.GetParquetDF()
        print(listing)
        path_parts = listing['path_display'].str.split('/', n=2, expand=True)
        listing[hierarchy_columns] = path_parts
    except Exception as e:
        # Best-effort: report the problem and hand back nothing.
        print("can't list files "+str(e))
        return None
    else:
        return listing
118
+
119
+
120
+
121
+
122
+
123
+
124
+ ########################################################################################################
125
def search_docs(keyword, plan):  # fast search in a file/couple of files
    """Search the selected PDF files for a keyword and highlight the hits.

    Each name in ``plan`` is resolved with
    ``tsadropboxretrieval.getPathtoPDF_File(nameofPDF=...)``; the first value
    it returns is used as the Dropbox download path.

    Args:
        keyword: Text to look for; it is upper-cased before searching.
        plan: Iterable of PDF names to search.  A single name given as a
            plain string is treated as a one-element list instead of being
            iterated character by character.

    Returns:
        ``(None, None)`` when ``keyword`` or ``plan`` is ``None``.  Otherwise
        a tuple ``(df, img_list)`` where ``df`` is a DataFrame with the
        columns ``'Keyword'``, ``'Document Name'`` and ``'Word Occurrence'``
        (one row per file with at least one hit) and ``img_list`` holds one
        BGR numpy image per page that contained a hit, rendered at 5x zoom
        with the hits highlighted.
    """
    if plan is None or keyword is None:
        return None, None

    if isinstance(plan, str):
        plan = [plan]

    keyword = keyword.upper()
    zoom = 5
    mat = fitz.Matrix(zoom, zoom)
    columns = ['Keyword', 'Document Name', 'Word Occurrence']
    rows = []  # collected per file; the DataFrame is built once at the end
    img_list = []

    # The client does not depend on the file, so create it once and reuse it.
    dbx_team = tsadropboxretrieval.ADR_Access_DropboxTeam('user')

    for filepdf in plan:  # loop over each requested file
        pdfpath, _ = tsadropboxretrieval.getPathtoPDF_File(nameofPDF=filepdf)
        _, res = dbx_team.files_download(path=pdfpath)

        occ = 0  # occurrences of the keyword in this file
        # The context manager closes the document once its pages are rendered.
        with fitz.open("pdf", res.content) as doc:
            for page in doc:
                matched = page.search_for(keyword)
                if not matched:
                    continue
                occ += len(matched)
                for rect in matched:
                    page.add_highlight_annot(rect)
                # Render only pages with hits; annotations are already applied.
                pix = page.get_pixmap(matrix=mat)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                img_list.append(cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR))

        if occ > 0:
            rows.append([keyword, filepdf, occ])

    return pd.DataFrame(rows, columns=columns), img_list
178
+
179
+
180
+ ######################################################################################
181
+
182
+ # repo_list=os.listdir('dropbox_plans')
183
+ # pop=gr.Error('Saving to dropbox')
184
+ # #get projects name from repo list
185
+ # f=[l.split(" ")[0]+" "+l.split(" ")[1]+" "+l.split(" ")[2] for l in repo_list]
186
+ # proj_list=list(set(f))
187
+
188
+ # with gr.Blocks(css="#clear {background: rgba(200,200,0,0.2) } #search {background: orangered}") as demo:
189
+ # with gr.Tabs():
190
+ # with gr.TabItem('File Search'):
191
+ # with gr.Row():
192
+ # with gr.Column():
193
+ # ip11=gr.Textbox(label='keyword')
194
+ # ip21=gr.CheckboxGroup(repo_list,label='plan')
195
+ # b1=gr.Button('Search',elem_id="search")
196
+ # c=gr.Button('Clear',elem_id="clear")
197
+ # with gr.Column():
198
+ # df1=gr.Dataframe(label='Found files')
199
+ # op1=gr.Gallery()
200
+ # save1=gr.Button('Save to Dropbox')
201
+
202
+ # with gr.TabItem('Folder Search'):
203
+ # with gr.Row():
204
+ # with gr.Column():
205
+ # gr.Plot(prepare_sunburst())
206
+ # ip12=gr.Textbox(label='keyword')
207
+ # drop=gr.Dropdown(proj_list,label='project')
208
+ # b2=gr.Button('Search',elem_id="search")
209
+ # with gr.Column():
210
+ # df2=gr.Dataframe(label='Found files')
211
+ # op2=gr.Gallery()
212
+ # save2=gr.Button('Save to Dropbox')
213
+ # b1.click(search_docs,inputs=[ip11,ip21],outputs=[df1,op1])
214
+ # b2.click(slow_search,inputs=[ip12,drop],outputs=[df2,op2])
215
+ # save1.click(pushToDropbox,inputs=[ip11,ip21])
216
+ # save2.click(pushAll,inputs=[ip12,drop])
217
+ # c.click(clear,outputs=[ip11,ip21,df1,op1])
218
+
219
+ # demo.launch(show_error=True)
220
+