Marthee committed on
Commit
b4a9a3c
·
1 Parent(s): 7aa2180

Upload tsadropboxretrieval.py

Browse files
Files changed (1) hide show
  1. tsadropboxretrieval.py +184 -0
tsadropboxretrieval.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """TSADropboxRetrieval.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1d-UI3Y-z7Dj-vqu69CxluOUnN4rvsUuE
8
+ """
9
+
10
+ # !pip install dropbox -q
11
+
12
+ # pip install pymupdf #==1.22.5
13
+
14
+ import base64
15
+ import requests
16
+ import json
17
+ import pathlib
18
+ import pandas as pd
19
+ import dropbox
20
+ from dropbox.exceptions import AuthError
21
+ import fitz
22
+ import io
23
+ import re
24
+ import pyarrow
25
+
26
+
27
+ """### NEW CODE - OCTOBER 26 - Marthe"""
28
+
29
# Module-level accumulator: handle_entries() appends one metadata dict per
# Dropbox file to this list and builds its DataFrame from the whole list.
files_list=[]

# NOTE(review): the Dropbox credentials below are hard-coded in source.
# They should be loaded from the environment / a secret store and rotated.
# app_key, app_secret and refresh_token are read by ADR_Access_DropboxTeam().
app_key='9bljerefjumct38'
app_secret='nl6k66clw1j1k12'
# NOTE(review): access_code is not referenced anywhere else in the visible
# source; presumably a leftover access token -- confirm before removing.
access_code='sl.Bou05Rb15xPy851-I1UV8oOabHPY21AEPl5nrYl-Q0ninFSy0kTuRWPSve_JPbd3Z03E7eBY4r9R454rdzDM0AxLkyqrQEDzyAGUwP7kZ7s2CR6EwvdLD2a7Xh8nFEs38voLTH2IHzrQ2QEx7rji4OJ8aSQStKtJkI7_dh8tYHj5'
refresh_token='qK2VqvbxWMMAAAAAAAAAAXFQvrHM4xUwWUcZ6l5vGOygn1iAA6zlDjmAQNBbZprL'
# Base64 of "app_key:app_secret" as bytes; not referenced elsewhere in the
# visible source.
basic_auth=base64.b64encode(f'{app_key}:{app_secret}'.encode())
36
+
37
def ADR_Access_DropboxTeam(flag):
    """Return a Dropbox team client whose path root is the team root namespace.

    The client is built from the module-level ``app_key``, ``app_secret`` and
    ``refresh_token`` and then bound to a fixed team member id.

    Args:
        flag: ``'user'`` to bind the client with ``as_user(...)``, or
            ``'admin'`` to bind it with ``as_admin(...)``.

    Returns:
        The bound client, re-rooted at the current account's root namespace.

    Raises:
        ValueError: If ``flag`` is neither ``'user'`` nor ``'admin'``.
    """
    # Fail early with a clear message; previously an unknown flag left the
    # client variable unbound and surfaced as an UnboundLocalError.
    if flag not in ('user', 'admin'):
        raise ValueError(
            "flag must be 'user' or 'admin', got {!r}".format(flag)
        )

    team = dropbox.DropboxTeam(
        app_key=app_key,
        app_secret=app_secret,
        oauth2_refresh_token=refresh_token,
    )
    member_id = 'dbmid:AACjJg2GKc3tI42iOnD01dd6s0XDyyx6Thw'
    if flag == 'user':
        dbxTeam = team.as_user(member_id)
    else:
        dbxTeam = team.as_admin(member_id)

    # Re-root the client so paths such as "/TSA Team Folder/..." are resolved
    # relative to the account's root namespace.
    root_namespace_id = dbxTeam.users_get_current_account().root_info.root_namespace_id
    dbxTeam = dbxTeam.with_path_root(dropbox.common.PathRoot.root(root_namespace_id))
    return dbxTeam
49
+
50
def getSharedLink(path):
    """Return the URL of a shared link for the Dropbox item at ``path``.

    Args:
        path: Dropbox path of the file or folder to share.

    Returns:
        The ``url`` attribute of the shared-link metadata.
    """
    dbxTeam = ADR_Access_DropboxTeam('user')
    try:
        shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(path)
    except Exception:
        # Fall back to the older endpoint -- presumably for the case where a
        # link already exists for this path (TODO confirm). ``Exception``
        # rather than a bare ``except`` so that KeyboardInterrupt/SystemExit
        # are not swallowed.
        shared_link_metadata = dbxTeam.sharing_create_shared_link(path)
    return shared_link_metadata.url
57
+
58
def handle_entries(entries):
    """Record file entries in ``files_list`` and return them as a DataFrame.

    Every ``dropbox.files.FileMetadata`` item in ``entries`` is appended to
    the module-level ``files_list`` as a dict with the keys ``name``,
    ``path_display``, ``client_modified`` and ``server_modified``; other
    entry types are ignored.

    Args:
        entries: Iterable of Dropbox listing entries.

    Returns:
        A DataFrame built from the *entire* ``files_list``, i.e. including
        records added by earlier calls.
    """
    files_list.extend(
        {
            'name': entry.name,
            'path_display': entry.path_display,
            'client_modified': entry.client_modified,
            'server_modified': entry.server_modified,
        }
        for entry in entries
        if isinstance(entry, dropbox.files.FileMetadata)
    )
    return pd.DataFrame.from_records(files_list)
72
+
73
def dropbox_connect():
    """Create a ``dropbox.Dropbox`` client from the credentials embedded below.

    Returns:
        The constructed ``dropbox.Dropbox`` client.

    Raises:
        AuthError: Re-raised after being logged. Previously this path fell
            through to ``return dbxMe`` with ``dbxMe`` unbound and failed
            with an unrelated ``UnboundLocalError``.
    """
    print('connecy')
    try:
        # NOTE(review): credentials are hard-coded; they should be loaded
        # from the environment / a secret store and rotated.
        dbxMe = dropbox.Dropbox(
            app_key='67w6ibpa9d2b60x',
            app_secret='d3ecz8g1604fu04',
            oauth2_refresh_token='R_LACBBNhysAAAAAAAAAAXt9mMy9OYIV_v4pF45lG6Z8DHNV66rq1q7acWjj_H5g',
        )
    except AuthError as e:
        print('Error connecting to Dropbox with access token: ' + str(e))
        raise
    return dbxMe
89
+
90
+
91
def dropbox_upload_file(df, flag=0):
    """Serialise ``df`` to parquet and upload it to the team Dropbox.

    The file is written to
    ``/TSA Team Folder/ADR Test/DropboxDirectory/df.parquet.gzip`` in
    overwrite mode. Failures are logged with ``print`` and not raised.

    Args:
        df: DataFrame to upload; must provide ``to_parquet()``.
        flag: Unused; kept for backward compatibility with existing callers.

    Returns:
        The admin team client used for the upload, or ``None`` when the
        client could not be obtained.
    """
    # Bind before the try-block so the final return is always defined, even
    # when acquiring the client itself fails.
    dbxTeam = None
    try:
        dbxTeam = ADR_Access_DropboxTeam('admin')
        path = '/TSA Team Folder/ADR Test/DropboxDirectory/df.parquet.gzip'
        # to_parquet() without a target path returns the file content as bytes.
        doc = df.to_parquet()
        dbxTeam.files_upload(doc, path, mode=dropbox.files.WriteMode("overwrite"))
    except Exception as e:
        print('Error uploading file to Dropbox: ' + str(e))
    return dbxTeam
101
+
102
+
103
def uploadanyFile(doc, pdfname, path):
    """Upload a document to the team Dropbox and return a shared link to it.

    Args:
        doc: Object whose ``write()`` method returns the bytes to upload
            (presumably a PyMuPDF document -- confirm against callers).
        pdfname: File name appended to ``path`` to form the destination.
        path: Destination folder path; concatenated with ``pdfname`` as-is,
            so it is expected to end with ``/``.

    Returns:
        The shared-link URL on success, or the string
        ``'Error uploading file to Dropbox.'`` when any step fails.
    """
    try:
        dbxTeam = ADR_Access_DropboxTeam('admin')
        path = path + pdfname
        dbxTeam.files_upload(doc.write(), path, mode=dropbox.files.WriteMode("overwrite"))
        try:
            shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(path)
        except Exception:
            # Fall back to the older endpoint -- presumably for the case
            # where a link already exists (TODO confirm). ``Exception``
            # rather than a bare ``except`` so KeyboardInterrupt/SystemExit
            # are not swallowed.
            shared_link_metadata = dbxTeam.sharing_create_shared_link(path)
        return shared_link_metadata.url
    except Exception as e:
        print('Error uploading file to Dropbox: ' + str(e))

    return 'Error uploading file to Dropbox.'
117
+
118
+ # Call when the dropbox is updated with new items - if not , call parquet saved version of the df of saved items
119
def DropboxItemstoDF():
    """List every file under "/TSA Team Folder" and persist the index.

    The folder is listed recursively, page by page. handle_entries()
    accumulates each page into the module-level ``files_list`` and returns a
    DataFrame of everything collected so far, so the frame from the last page
    covers the full listing. That frame is then uploaded via
    dropbox_upload_file().

    Returns:
        DataFrame with one row per file found.
    """
    # Start from a clean slate: handle_entries() appends to the module-level
    # list, so without this a second call would duplicate every row.
    files_list.clear()

    dbxTeam = ADR_Access_DropboxTeam('user')  # or pass dbx in parameters
    folder_path = "/TSA Team Folder"
    res = dbxTeam.files_list_folder(path=folder_path, recursive=True)
    # Keep the first page's frame: when the listing fits in one page the
    # loop below never runs (previously that case raised NameError).
    df = handle_entries(res.entries)
    while res.has_more:
        res = dbxTeam.files_list_folder_continue(cursor=res.cursor)
        df = handle_entries(res.entries)

    dropbox_upload_file(df)
    return df
130
+
131
+ # df2=DropboxItemstoDF()
132
+
133
+ # Feather format for storing data
134
+ # def ToFeather(df2):
135
+ # df2.to_feather('df2.feather')
136
+ # fthr=pd.read_feather('df2.feather')
137
+ # return fthr
138
+
139
def GetParquetDF():
    """Download the saved parquet index from Dropbox and load it.

    A shared link is obtained for
    ``/TSA Team Folder/ADR Test/DropboxDirectory/df.parquet.gzip``, the file
    is fetched through that link, and its bytes are parsed with
    ``pd.read_parquet``.

    Returns:
        The DataFrame stored in the parquet file.
    """
    parquet_path = '/TSA Team Folder/ADR Test/DropboxDirectory/df.parquet.gzip'
    dbxTeam = ADR_Access_DropboxTeam('user')  # or pass dbx in parameters
    try:
        shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(path=parquet_path)
    except Exception:
        # Fall back to the older endpoint -- presumably for the case where a
        # link already exists (TODO confirm). ``Exception`` rather than a
        # bare ``except`` so KeyboardInterrupt/SystemExit are not swallowed.
        shared_link_metadata = dbxTeam.sharing_create_shared_link(path=parquet_path)
    _metadata, res = dbxTeam.sharing_get_shared_link_file(url=shared_link_metadata.url)

    # Wrap the downloaded bytes in a file-like object for pandas.
    return pd.read_parquet(io.BytesIO(res.content))
152
def getPathtoPDF_File(nameofPDF):
    """Look up a file by name in the parquet index and return its path and link.

    Args:
        nameofPDF: File name to search for; double quotes are stripped
            before matching against the ``name`` column.

    Returns:
        ``(path, link)`` for the first row whose name matches, or the string
        ``'Project does not exist'`` when the lookup or link creation fails.
    """
    parquetDf = GetParquetDF()
    nameofPDF = nameofPDF.replace('"', '')
    try:
        path = parquetDf.loc[parquetDf['name'] == nameofPDF, 'path_display'].iloc[0]
        link = getSharedLink(path)
    except Exception:
        # Best-effort contract kept: any lookup/link failure maps to the
        # sentinel string. ``Exception`` rather than a bare ``except`` so
        # KeyboardInterrupt/SystemExit are not swallowed.
        return 'Project does not exist'
    return path, link
161
+ # parquetDf
162
+
163
+ # getPathtoPDF_File('A5157-EBLA-V5-XX-SH-L-0004-D2-01.pdf')
164
def getPDFData(path):
    """Download the Dropbox file at ``path`` and return its content.

    Args:
        path: Dropbox path of the file to download.

    Returns:
        The ``content`` of the download response.
    """
    _metadata, response = ADR_Access_DropboxTeam('admin').files_download(path)
    return response.content
169
+
170
def retrieveProjects(projname):
    """Find indexed files that belong to a project's "01 Project Details".

    ``projname + ' 01 Project Details'`` is lower-cased and split on
    punctuation/whitespace; a file matches when every resulting token occurs
    somewhere in its lower-cased ``path_display``.

    Args:
        projname: Project name to search for.

    Returns:
        ``(documnetsToMeasure, RelevantDocuments)``: two lists of
        ``[name, path]`` pairs. The second holds every matching file, the
        first only those whose name ends with ``'.pdf'``.
    """
    parquetDf = GetParquetDF()
    documnetsToMeasure = []
    RelevantDocuments = []
    projnameWithDetails = projname + ' 01 Project Details'
    matches = re.split(r'[`\-= ~!@#$%^&*()_+\[\]{};\'\\:"|<,/<>?]', projnameWithDetails.lower())

    # Walk name/path pairs row by row. Looking the path up again by file name
    # (as before) returned the first file with that name, which is the wrong
    # path when two folders contain identically named files.
    for name, path in zip(parquetDf['name'], parquetDf['path_display']):
        lowered_path = path.lower()
        if all(token in lowered_path for token in matches):
            RelevantDocuments.append([name, path])
            if name.endswith('.pdf'):
                documnetsToMeasure.append([name, path])
    return documnetsToMeasure, RelevantDocuments