Spaces:
Sleeping
Sleeping
| # -*- coding: utf-8 -*- | |
| """Doc_search.ipynb | |
| Automatically generated by Colaboratory. | |
| Original file is located at | |
| https://colab.research.google.com/drive/1hyshr_1HJFGUVqKRjK7gfPq75lGIYjsd | |
| """ | |
| # !pip install gradio -q | |
| import os | |
| from PIL import Image | |
| # import gradio as gr | |
| import fitz | |
| import glob | |
| import pandas as pd | |
| import numpy as np | |
| import tsadropboxretrieval | |
| import cv2 | |
| #from db import dropbox_upload_file,dropbox_list_files | |
| import plotly.express as px | |
| ######################################################################################################### | |
| # def pushToDropbox(ip1,ip2): | |
| # df,img_list=search_docs(ip1,ip2) | |
| # #push df first | |
| # print('hi') | |
| # dropbox_upload_file('.',local_file=df,dropbox_file_path='/SearchedDocs/'+ip1+'SearchSummary.csv') | |
| # c=0 | |
| # for p in img_list: #push images gallery | |
| # dropbox_upload_file('.',local_file=np.array(p),dropbox_file_path='/SearchedDocs/'+ip1+str(c)+'.png') | |
| # c+=1 | |
| # return pop | |
| ############################################################################################################# | |
def clear():
    """Return a tuple of four ``None`` values.

    The commented-out Gradio wiring elsewhere in this file binds this
    function to four output components, so each ``None`` resets one of them.
    """
    return (None,) * 4
| ################################################################################################################## | |
| # def pushAll(ip1,proj): | |
| # df,img_list=slow_search(ip1,proj) | |
| # #push df first | |
| # #print('hi') | |
| # dropbox_upload_file('.',local_file=df,dropbox_file_path='/SearchedDocs/'+ip1+'Searchedproj'+proj+'.csv') | |
| # c=0 | |
| # for p in img_list: #push images gallery | |
| # dropbox_upload_file('.',local_file=np.array(p),dropbox_file_path='/SearchedDocs/'+ip1+str(c)+proj+'ALL.png') | |
| # c+=1 | |
| ########################################################################################################### | |
def slow_search(keyword, project):  # slow search in all files existing
    """Search every PDF of a project for ``keyword`` and highlight the hits.

    Each document listed by ``tsadropboxretrieval.retrieveProjects(project)[0]``
    is downloaded from Dropbox, opened in memory and scanned page by page.
    Matches are highlighted on the page, and every page with at least one
    match is rendered to an image.

    Args:
        keyword: Text to look for; it is upper-cased before searching.
            ``None`` short-circuits the search.
        project: Project identifier forwarded to
            ``tsadropboxretrieval.retrieveProjects``.

    Returns:
        ``(None, None)`` when ``keyword`` is ``None``. Otherwise a tuple
        ``(df, img_list)`` where ``df`` is a DataFrame with the columns
        ``Keyword``, ``Document Name`` and ``Word Occurrence`` (one row per
        document containing the keyword) and ``img_list`` is a list of BGR
        numpy arrays, one per matching page, with the matches highlighted.
    """
    if keyword is None:
        return None, None

    keyword = keyword.upper()
    columns = ['Keyword', 'Document Name', 'Word Occurrence']
    rows = []
    img_list = []
    zoom = 5  # render pages at 5x so the highlighted text stays legible
    mat = fitz.Matrix(zoom, zoom)

    documents = tsadropboxretrieval.retrieveProjects(project)[0]
    # One client serves all downloads instead of re-authenticating per file.
    # NOTE(review): assumes the returned client is reusable across calls.
    dbx_team = tsadropboxretrieval.ADR_Access_DropboxTeam('user')

    for filepdf in documents:  # each entry is indexed as [0]=name, [1]=path
        print(filepdf[0])
        _md, res = dbx_team.files_download(path=filepdf[1])
        occ = 0  # occurrences of the keyword in this document
        # The context manager closes the document even if a page fails.
        with fitz.open(stream=res.content, filetype="pdf") as doc:
            for page in doc:
                matched = page.search_for(keyword)
                if not matched:
                    continue
                occ += len(matched)
                for hit in matched:
                    page.add_highlight_annot(hit)
                # Render after annotating so the highlights are in the image.
                pix = page.get_pixmap(matrix=mat)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                img_list.append(cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR))
        if occ > 0:
            rows.append([keyword, filepdf[0], occ])

    # Build the summary once rather than concatenating inside the loop.
    df = pd.DataFrame(rows, columns=columns)
    return df, img_list
| ####################################################################################################### | |
def prepare_sunburst():
    """Load the Dropbox file listing and split each path into three columns.

    Fetches a DataFrame from ``tsadropboxretrieval.GetParquetDF()``, prints
    it, and splits its ``path_display`` column on ``'/'`` (at most two
    splits) into new ``root``, ``parent`` and ``child`` columns. Despite the
    name, no chart is produced here; only the DataFrame is returned.

    Returns:
        The augmented DataFrame, or ``None`` if anything raised. In that case
        the error is printed instead of propagated.
    """
    try:
        listing = tsadropboxretrieval.GetParquetDF()
        print(listing)
        path_parts = listing['path_display'].str.split('/', n=2, expand=True)
        listing[['root', 'parent', 'child']] = path_parts
    except Exception as e:
        print("can't list files "+str(e))
        return None
    return listing
| ######################################################################################################## | |
def search_docs(keyword, plan):  # fast search in a file/couple of files
    """Search the selected PDFs for ``keyword`` and highlight the hits.

    Each name in ``plan`` is resolved to a Dropbox path with
    ``tsadropboxretrieval.getPathtoPDF_File``, downloaded, opened in memory
    and scanned page by page. Matches are highlighted on the page, and every
    page with at least one match is rendered to an image.

    Args:
        keyword: Text to look for; it is upper-cased before searching.
        plan: Iterable of PDF names to search. A single name given as a bare
            string is treated as a one-element list.

    Returns:
        ``(None, None)`` when ``keyword`` or ``plan`` is ``None``. Otherwise a
        tuple ``(df, img_list)`` where ``df`` is a DataFrame with the columns
        ``Keyword``, ``Document Name`` and ``Word Occurrence`` (one row per
        document containing the keyword) and ``img_list`` is a list of BGR
        numpy arrays, one per matching page, with the matches highlighted.
    """
    if plan is None or keyword is None:
        return None, None

    columns = ['Keyword', 'Document Name', 'Word Occurrence']
    # A bare string would otherwise be iterated character by character.
    if isinstance(plan, str):
        plan = [plan]
    if not plan:
        # Nothing selected: skip the Dropbox and PyMuPDF setup entirely.
        return pd.DataFrame(columns=columns), []

    keyword = keyword.upper()
    rows = []
    img_list = []
    zoom = 5  # render pages at 5x so the highlighted text stays legible
    mat = fitz.Matrix(zoom, zoom)

    # One client serves all downloads instead of re-authenticating per file.
    # NOTE(review): assumes the returned client is reusable across calls.
    dbx_team = tsadropboxretrieval.ADR_Access_DropboxTeam('user')

    for filepdf in plan:
        pdfpath, _pdflink = tsadropboxretrieval.getPathtoPDF_File(nameofPDF=filepdf)
        _md, res = dbx_team.files_download(path=pdfpath)
        occ = 0  # occurrences of the keyword in this document
        # The context manager closes the document even if a page fails.
        with fitz.open(stream=res.content, filetype="pdf") as doc:
            for page in doc:
                matched = page.search_for(keyword)
                if not matched:
                    continue
                occ += len(matched)
                for hit in matched:
                    page.add_highlight_annot(hit)
                # Render after annotating so the highlights are in the image.
                pix = page.get_pixmap(matrix=mat)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                img_list.append(cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR))
        if occ > 0:
            rows.append([keyword, filepdf, occ])

    # Build the summary once rather than concatenating inside the loop.
    df = pd.DataFrame(rows, columns=columns)
    return df, img_list
| ###################################################################################### | |
| # repo_list=os.listdir('dropbox_plans') | |
| # pop=gr.Error('Saving to dropbox') | |
| # #get projects name from repo list | |
| # f=[l.split(" ")[0]+" "+l.split(" ")[1]+" "+l.split(" ")[2] for l in repo_list] | |
| # proj_list=list(set(f)) | |
| # with gr.Blocks(css="#clear {background: rgba(200,200,0,0.2) } #search {background: orangered}") as demo: | |
| # with gr.Tabs(): | |
| # with gr.TabItem('File Search'): | |
| # with gr.Row(): | |
| # with gr.Column(): | |
| # ip11=gr.Textbox(label='keyword') | |
| # ip21=gr.CheckboxGroup(repo_list,label='plan') | |
| # b1=gr.Button('Search',elem_id="search") | |
| # c=gr.Button('Clear',elem_id="clear") | |
| # with gr.Column(): | |
| # df1=gr.Dataframe(label='Found files') | |
| # op1=gr.Gallery() | |
| # save1=gr.Button('Save to Dropbox') | |
| # with gr.TabItem('Folder Search'): | |
| # with gr.Row(): | |
| # with gr.Column(): | |
| # gr.Plot(prepare_sunburst()) | |
| # ip12=gr.Textbox(label='keyword') | |
| # drop=gr.Dropdown(proj_list,label='project') | |
| # b2=gr.Button('Search',elem_id="search") | |
| # with gr.Column(): | |
| # df2=gr.Dataframe(label='Found files') | |
| # op2=gr.Gallery() | |
| # save2=gr.Button('Save to Dropbox') | |
| # b1.click(search_docs,inputs=[ip11,ip21],outputs=[df1,op1]) | |
| # b2.click(slow_search,inputs=[ip12,drop],outputs=[df2,op2]) | |
| # save1.click(pushToDropbox,inputs=[ip11,ip21]) | |
| # save2.click(pushAll,inputs=[ip12,drop]) | |
| # c.click(clear,outputs=[ip11,ip21,df1,op1]) | |
| # demo.launch(show_error=True) | |