Marthee committed on
Commit
96b07cc
·
verified ·
1 Parent(s): 466f9bd

Update tsadropboxretrieval.py

Browse files
Files changed (1) hide show
  1. tsadropboxretrieval.py +87 -28
tsadropboxretrieval.py CHANGED
@@ -126,10 +126,10 @@ def uploadanyFile(doc,pdfname,path,flag=0):
126
 
127
  # Call when the dropbox is updated with new items - if not , call parquet saved version of the df of saved items
128
  # Call when the dropbox is updated with new items - if not , call parquet saved version of the df of saved items
129
- def DropboxItemstoDF():
130
  files_list=[]
131
  dbxTeam=ADR_Access_DropboxTeam('user') # or pass dbx in parameters
132
- folder_path = "/TSA Team Folder"
133
  res = dbxTeam.files_list_folder(path=folder_path, recursive=True )
134
  # df1=handle_entries(res.entries , files_list)
135
  if res.has_more:
@@ -138,7 +138,7 @@ def DropboxItemstoDF():
138
  df2=handle_entries(res.entries , files_list)
139
 
140
  # dbxTeam=dropbox_upload_file(df2)
141
- print(df2)
142
  return df2 , files_list
143
 
144
  # df2=DropboxItemstoDF()
@@ -149,19 +149,41 @@ def DropboxItemstoDF():
149
  # fthr=pd.read_feather('df2.feather')
150
  # return fthr
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  def GetParquetDF():
153
- # # read the parquet file in current directory, back into a pandas data frame
154
- dbxTeam=ADR_Access_DropboxTeam('user') # or pass dbx in parameters
155
- try:
156
- shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings( path='/TSA Team Folder/ADR Test/DropboxDirectory/df.parquet.gzip')
157
- except:
158
- shared_link_metadata=dbxTeam.sharing_create_shared_link( path='/TSA Team Folder/ADR Test/DropboxDirectory/df.parquet.gzip')
159
- metadata, res = dbxTeam.sharing_get_shared_link_file(url=shared_link_metadata.url)
160
- data=res.content # or res.content, or iter_content, or iter_lines, etc. as needed
 
 
 
 
 
 
 
 
 
 
161
 
162
- pq_file = io.BytesIO(data)
163
- df = pd.read_parquet(pq_file)
164
- return df
165
 
166
  def getPathtoPDF_File(nameofPDF):
167
  parquetDf=GetParquetDF()
@@ -181,20 +203,57 @@ def getPDFData(path):
181
  data = res.content
182
  return data
183
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
184
  def retrieveProjects(projname):
185
  print('retrieve')
 
186
  parquetDf=GetParquetDF()
187
- print('p',parquetDf)
188
- documentsToMeasure=[]
189
- RelevantDocuments=[]
190
- projnameWithDetails=''
191
- projnameWithDetails=projname+' 01 Project Details'
192
- matches=re.split(r'[`\-= ~!@#$%^&*()_+\[\]{};\'\\:"|<,/<>?]' , projnameWithDetails.lower())
193
- for s in parquetDf['path_display']:
194
- if all(x in s.lower() for x in matches):
195
- name=parquetDf.loc[parquetDf['path_display'] == s, 'name'].iloc[0]
196
- path=parquetDf.loc[parquetDf['name'] == name, 'path_display'].iloc[0]
197
- RelevantDocuments.append([name,path])
198
- if name.endswith('.pdf'):
199
- documentsToMeasure.append([name,path])
200
- return documentsToMeasure,RelevantDocuments
 
 
 
 
 
 
 
 
 
 
126
 
127
  # Call when the dropbox is updated with new items - if not , call parquet saved version of the df of saved items
128
  # Call when the dropbox is updated with new items - if not , call parquet saved version of the df of saved items
129
+ def DropboxItemstoDF(folder_path):
130
  files_list=[]
131
  dbxTeam=ADR_Access_DropboxTeam('user') # or pass dbx in parameters
132
+ # folder_path = "/TSA Team Folder"
133
  res = dbxTeam.files_list_folder(path=folder_path, recursive=True )
134
  # df1=handle_entries(res.entries , files_list)
135
  if res.has_more:
 
138
  df2=handle_entries(res.entries , files_list)
139
 
140
  # dbxTeam=dropbox_upload_file(df2)
141
+ # print(df2)
142
  return df2 , files_list
143
 
144
  # df2=DropboxItemstoDF()
 
149
  # fthr=pd.read_feather('df2.feather')
150
  # return fthr
151
 
152
+ # def GetParquetDF():
153
+ # # # read the parquet file in current directory, back into a pandas data frame
154
+ # dbxTeam=ADR_Access_DropboxTeam('user') # or pass dbx in parameters
155
+ # try:
156
+ # shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings( path='/TSA Team Folder/ADR Test/DropboxDirectory/df.parquet.gzip')
157
+ # except:
158
+ # shared_link_metadata=dbxTeam.sharing_create_shared_link( path='/TSA Team Folder/ADR Test/DropboxDirectory/df.parquet.gzip')
159
+ # metadata, res = dbxTeam.sharing_get_shared_link_file(url=shared_link_metadata.url)
160
+ # data=res.content # or res.content, or iter_content, or iter_lines, etc. as needed
161
+
162
+ # pq_file = io.BytesIO(data)
163
+ # df = pd.read_parquet(pq_file)
164
+ # return df
165
+
166
+
167
  def GetParquetDF():
168
+ # Initialize Dropbox client
169
+ dbxTeam = ADR_Access_DropboxTeam('user') # or pass dbx in parameters
170
+ # Define the path to the Parquet file on Dropbox
171
+ path = '/TSA Team Folder/ADR Test/DropboxDirectory/df.parquet.gzip'
172
+ try:
173
+ # Try to create a shared link with settings
174
+ shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(path=path)
175
+ except dropbox.exceptions.ApiError:
176
+ # If settings are not supported, create a shared link without settings
177
+ shared_link_metadata = dbxTeam.sharing_create_shared_link(path=path)
178
+ # Get the file content from the shared link
179
+ _, res = dbxTeam.sharing_get_shared_link_file(url=shared_link_metadata.url)
180
+ data = res.content
181
+ # Read the Parquet file content into a pandas DataFrame
182
+ with io.BytesIO(data) as pq_file:
183
+ df = pd.read_parquet(pq_file)
184
+
185
+ return df
186
 
 
 
 
187
 
188
  def getPathtoPDF_File(nameofPDF):
189
  parquetDf=GetParquetDF()
 
203
  data = res.content
204
  return data
205
 
206
+ # def retrieveProjects(projname):
207
+ # print('retrieve')
208
+
209
+ # parquetDf=GetParquetDF()
210
+ # documentsToMeasure = []
211
+ # RelevantDocuments = []
212
+ # projnameWithDetails = f'{projname} 01 Project Details'
213
+
214
+ # # Split the project name into words and convert to lowercase
215
+ # matches = set(re.findall(r'\b\w+\b', projnameWithDetails.lower()))
216
+
217
+ # # Convert the 'path_display' column to lowercase for case-insensitive matching
218
+ # parquetDf['path_display_lower'] = parquetDf['path_display'].str.lower()
219
+
220
+ # # Check if all words in matches are present in the path
221
+ # mask = parquetDf['path_display_lower'].apply(lambda x: all(match in x for match in matches))
222
+
223
+ # # Filter relevant documents
224
+ # RelevantDocuments = parquetDf.loc[mask, ['name', 'path_display']].values.tolist()
225
+
226
+ # # Filter documents to measure
227
+ # documentsToMeasure = parquetDf.loc[(mask) & (parquetDf['name'].str.endswith('.pdf')), ['name', 'path_display']].values.tolist()
228
+
229
+ # # Remove the temporary 'path_display_lower' column
230
+ # parquetDf.drop(columns=['path_display_lower'], inplace=True)
231
+ # return documentsToMeasure,RelevantDocuments
232
+
233
  def retrieveProjects(projname):
234
  print('retrieve')
235
+
236
  parquetDf=GetParquetDF()
237
+ documentsToMeasure = []
238
+ RelevantDocuments = []
239
+ projnameWithDetails = f'{projname} 01 Project Details'
240
+ # Split the project name into words and convert to lowercase
241
+ matches = set(re.split(r'[`\-= ~!@#$%^&*()_+\[\]{};\'\\:"|<,/<>?]', projnameWithDetails.lower()))
242
+
243
+ # Convert the 'path_display' column to lowercase for case-insensitive matching
244
+ parquetDf['path_display_lower'] = parquetDf['path_display'].str.lower()
245
+
246
+ # Create a mask to filter relevant documents
247
+ mask = parquetDf['path_display_lower'].apply(lambda x: all(match in x for match in matches))
248
+
249
+ # Filter RelevantDocuments and documentsToMeasure using the mask
250
+ RelevantDocuments = parquetDf[mask][['name', 'path_display']].values.tolist()
251
+ documentsToMeasure = RelevantDocuments # Filter documentsToMeasure for PDF files later if needed
252
+
253
+ # Remove the temporary 'path_display_lower' column
254
+ parquetDf.drop(columns=['path_display_lower'], inplace=True)
255
+ return documentsToMeasure,RelevantDocuments
256
+
257
+
258
+
259
+