# -*- coding: utf-8 -*-
"""Doc_search.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1hyshr_1HJFGUVqKRjK7gfPq75lGIYjsd
"""

# !pip install gradio -q
import glob
import os

import cv2
import fitz
# import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
from PIL import Image

import tsadropboxretrieval
# from db import dropbox_upload_file,dropbox_list_files

# Column layout of the summary table returned by both search functions.
_RESULT_COLUMNS = ['Keyword', 'Document Name', 'Word Occurrence']

# Scale factor applied on both axes when a matching page is rendered.
_RENDER_ZOOM = 5

###############################################################################
# Disabled helper (kept for reference):
#
# def pushToDropbox(ip1,ip2):
#     df,img_list=search_docs(ip1,ip2)
#     #push df first
#     print('hi')
#     dropbox_upload_file('.',local_file=df,dropbox_file_path='/SearchedDocs/'+ip1+'SearchSummary.csv')
#     c=0
#     for p in img_list: #push images gallery
#         dropbox_upload_file('.',local_file=np.array(p),dropbox_file_path='/SearchedDocs/'+ip1+str(c)+'.png')
#         c+=1
#     return pop
###############################################################################


def clear():
    """Return four ``None`` values.

    The disabled UI code at the bottom of this file wires this to a "Clear"
    button with four outputs, one ``None`` per output.
    """
    return None, None, None, None


###############################################################################
# Disabled helper (kept for reference):
#
# def pushAll(ip1,proj):
#     df,img_list=slow_search(ip1,proj)
#     #push df first
#     #print('hi')
#     dropbox_upload_file('.',local_file=df,dropbox_file_path='/SearchedDocs/'+ip1+'Searchedproj'+proj+'.csv')
#     c=0
#     for p in img_list: #push images gallery
#         dropbox_upload_file('.',local_file=np.array(p),dropbox_file_path='/SearchedDocs/'+ip1+str(c)+proj+'ALL.png')
#         c+=1
###############################################################################


def _download_pdf_bytes(dropbox_path):
    """Download the file at *dropbox_path* and return its raw content."""
    # NOTE(review): a client is created for every download, exactly as the
    # original code did. It can probably be created once per search and
    # reused -- confirm against tsadropboxretrieval before hoisting.
    dbx_team = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
    _metadata, response = dbx_team.files_download(path=dropbox_path)
    return response.content


def _scan_pdf(pdf_bytes, keyword, matrix):
    """Search one in-memory PDF for *keyword*.

    Every hit is highlighted on its page, and each page that contains at
    least one hit is rendered (with the highlights) through *matrix*.

    Returns:
        ``(occurrences, page_images)`` where ``occurrences`` is the total
        number of hits in the document and ``page_images`` is a list of BGR
        NumPy arrays, one per page that had a hit, in page order.
    """
    occurrences = 0
    page_images = []
    # The context manager closes the document even if a page fails to render;
    # the original code never closed it.
    with fitz.open("pdf", pdf_bytes) as doc:
        for page in doc:
            hits = page.search_for(keyword)
            if not hits:
                continue
            occurrences += len(hits)
            for rect in hits:
                page.add_highlight_annot(rect)
            pix = page.get_pixmap(matrix=matrix)
            # assumes the pixmap holds 3-channel RGB samples (no alpha) --
            # TODO confirm if get_pixmap is ever called with alpha=True
            rgb = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            page_images.append(cv2.cvtColor(np.array(rgb), cv2.COLOR_RGB2BGR))
    return occurrences, page_images


def _summary_frame(rows):
    """Build the result table from ``[keyword, document name, count]`` rows."""
    # Built in one go: concatenating row by row onto an empty frame is
    # quadratic and triggers pandas' empty-concat FutureWarning.
    return pd.DataFrame(rows, columns=_RESULT_COLUMNS)


def slow_search(keyword, project):
    """Search every document of *project* for *keyword*.

    The document list is the first element of
    ``tsadropboxretrieval.retrieveProjects(project)``; for each entry, index 0
    is used as the document name and index 1 as the path to download.

    Args:
        keyword: Text to look for. It is upper-cased before searching and
            before being written to the result table. ``None`` disables the
            search.
        project: Passed through to ``tsadropboxretrieval.retrieveProjects``.

    Returns:
        ``(None, None)`` when *keyword* is ``None``. Otherwise
        ``(df, img_list)``: ``df`` has the columns ``Keyword``,
        ``Document Name`` and ``Word Occurrence`` with one row per document
        that contains at least one hit; ``img_list`` holds a BGR image of
        every page with a hit, across all documents.
    """
    if keyword is None:
        return None, None

    keyword = keyword.upper()
    matrix = fitz.Matrix(_RENDER_ZOOM, _RENDER_ZOOM)
    # print([nela for nela in glob.glob("dropbox_plans/"+project+"*.pdf")])
    documents = tsadropboxretrieval.retrieveProjects(project)[0]

    rows = []
    img_list = []
    for document in documents:
        doc_name, doc_path = document[0], document[1]
        print(doc_name)  # progress output, one line per document
        occurrences, page_images = _scan_pdf(
            _download_pdf_bytes(doc_path), keyword, matrix
        )
        img_list.extend(page_images)
        if occurrences > 0:
            rows.append([keyword, doc_name, occurrences])
    return _summary_frame(rows), img_list


def prepare_sunburst():
    """Load the file listing and split ``path_display`` into three columns.

    ``path_display`` is split on ``/`` at most twice and the pieces are stored
    in the new columns ``root``, ``parent`` and ``child``.

    Returns:
        The augmented DataFrame, or ``None`` if anything goes wrong (the
        error is printed; this function is deliberately best-effort).
    """
    try:
        df = tsadropboxretrieval.GetParquetDF()
        # df=dropbox_list_files("").reset_index(drop=True)
        df[['root', 'parent', 'child']] = df['path_display'].str.split(
            '/', n=2, expand=True
        )
        # Disabled plotting code (kept for reference):
        # tree=px.sunburst(df,path= ['parent', 'child'],width=700,height=600,title='Dropbox Files Hierarchy')
        # tree.update_traces(textfont=dict(size=14))
        # tree.write_image('imgsunburstt.png')
        return df
    except Exception as e:
        print("can't list files "+str(e))
        return None


def search_docs(keyword, plan):
    """Search a chosen set of PDF files for *keyword*.

    Each name in *plan* is resolved with
    ``tsadropboxretrieval.getPathtoPDF_File(nameofPDF=...)``; the first
    element of the returned pair is the path that gets downloaded.

    Args:
        keyword: Text to look for. It is upper-cased before searching and
            before being written to the result table.
        plan: Iterable of PDF names. A single name given as a plain string is
            treated as a one-element list instead of being iterated character
            by character.

    Returns:
        ``(None, None)`` when *keyword* or *plan* is ``None``. Otherwise
        ``(df, img_list)`` with the same layout as :func:`slow_search`; the
        ``Document Name`` column holds the names from *plan*.
    """
    if plan is None or keyword is None:
        return None, None
    if isinstance(plan, str):
        plan = [plan]

    keyword = keyword.upper()
    matrix = fitz.Matrix(_RENDER_ZOOM, _RENDER_ZOOM)

    rows = []
    img_list = []
    for pdf_name in plan:
        pdf_path, _pdf_link = tsadropboxretrieval.getPathtoPDF_File(
            nameofPDF=pdf_name
        )
        occurrences, page_images = _scan_pdf(
            _download_pdf_bytes(pdf_path), keyword, matrix
        )
        img_list.extend(page_images)
        if occurrences > 0:
            rows.append([keyword, pdf_name, occurrences])
    return _summary_frame(rows), img_list


###############################################################################
# Disabled Gradio UI (kept for reference):
#
# repo_list=os.listdir('dropbox_plans')
# pop=gr.Error('Saving to dropbox')
# #get projects name from repo list
# f=[l.split(" ")[0]+" "+l.split(" ")[1]+" "+l.split(" ")[2] for l in repo_list]
# proj_list=list(set(f))
# with gr.Blocks(css="#clear {background: rgba(200,200,0,0.2) } #search {background: orangered}") as demo:
#     with gr.Tabs():
#         with gr.TabItem('File Search'):
#             with gr.Row():
#                 with gr.Column():
#                     ip11=gr.Textbox(label='keyword')
#                     ip21=gr.CheckboxGroup(repo_list,label='plan')
#                     b1=gr.Button('Search',elem_id="search")
#                     c=gr.Button('Clear',elem_id="clear")
#                 with gr.Column():
#                     df1=gr.Dataframe(label='Found files')
#                     op1=gr.Gallery()
#                     save1=gr.Button('Save to Dropbox')
#         with gr.TabItem('Folder Search'):
#             with gr.Row():
#                 with gr.Column():
#                     gr.Plot(prepare_sunburst())
#                     ip12=gr.Textbox(label='keyword')
#                     drop=gr.Dropdown(proj_list,label='project')
#                     b2=gr.Button('Search',elem_id="search")
#                 with gr.Column():
#                     df2=gr.Dataframe(label='Found files')
#                     op2=gr.Gallery()
#                     save2=gr.Button('Save to Dropbox')
#     b1.click(search_docs,inputs=[ip11,ip21],outputs=[df1,op1])
#     b2.click(slow_search,inputs=[ip12,drop],outputs=[df2,op2])
#     save1.click(pushToDropbox,inputs=[ip11,ip21])
#     save2.click(pushAll,inputs=[ip12,drop])
#     c.click(clear,outputs=[ip11,ip21,df1,op1])
# demo.launch(show_error=True)