Marthee committed on
Commit
7e8489d
·
verified ·
1 Parent(s): 133b1c7

Update tsadropboxretrieval.py

Browse files
Files changed (1) hide show
  1. tsadropboxretrieval.py +15 -6
tsadropboxretrieval.py CHANGED
@@ -8,7 +8,7 @@ Original file is located at
8
  # !pip install dropbox -q
9
 
10
  # pip install pymupdf #==1.22.5
11
-
12
  import base64
13
  import requests
14
  import json
@@ -248,7 +248,6 @@ def getPDFData(path):
248
  data = res.content
249
  return data
250
 
251
-
252
  def retrieveProjects(projname, progress_callback=None):
253
  # if progress_callback:
254
  progress_callback(20)
@@ -273,11 +272,22 @@ def retrieveProjects(projname, progress_callback=None):
273
  parquetDf['path_display_lower'] = parquetDf['path_display'].str.lower()
274
  if progress_callback:
275
  progress_callback(50)
 
276
  # Filter using the lowercase column but retrieve the original paths
277
- # mask = parquetDf['path_display_lower'].apply(lambda x: '/'+projname in x and '01 project details' in x)
278
- mask = parquetDf['path_display_lower'].apply(lambda x: projname in x and '01 project details' in x)
 
 
 
 
 
 
 
 
 
 
279
  if progress_callback:
280
- progress_callback(60)
281
  # Retrieve the original (case-sensitive) paths before lowering them
282
  RelevantDocuments = parquetDf[mask][['name', 'original_path_display']].values.tolist()
283
  documentsToMeasure = [doc for doc in RelevantDocuments if doc[0].endswith('.pdf')] # Keep only PDFs
@@ -297,5 +307,4 @@ def retrieveProjects(projname, progress_callback=None):
297
  if progress_callback:
298
  progress_callback(80)
299
  return documentsToMeasure, RelevantDocuments, extracted_path
300
-
301
 
 
8
  # !pip install dropbox -q
9
 
10
  # pip install pymupdf #==1.22.5
11
+ import os
12
  import base64
13
  import requests
14
  import json
 
248
  data = res.content
249
  return data
250
 
 
251
  def retrieveProjects(projname, progress_callback=None):
252
  # if progress_callback:
253
  progress_callback(20)
 
272
  parquetDf['path_display_lower'] = parquetDf['path_display'].str.lower()
273
  if progress_callback:
274
  progress_callback(50)
275
+ ##### Updated code #######
276
  # Filter using the lowercase column but retrieve the original paths
277
+ def path_matches(x):
278
+ #remove the file name from the path
279
+ folder_path = os.path.dirname(x)
280
+ #Check if project number (like /2564) appears in folder path
281
+ has_projnum = projname in folder_path
282
+ #Check if "01 project details" appears in folder path
283
+ has_details = '01 project details' in folder_path
284
+ return has_projnum and has_details
285
+
286
+ #Apply the mask
287
+ mask = parquetDf['path_display_lower'].apply(path_matches)
288
+ #### End of updated code #####
289
  if progress_callback:
290
+ progress_callback(60)
291
  # Retrieve the original (case-sensitive) paths before lowering them
292
  RelevantDocuments = parquetDf[mask][['name', 'original_path_display']].values.tolist()
293
  documentsToMeasure = [doc for doc in RelevantDocuments if doc[0].endswith('.pdf')] # Keep only PDFs
 
307
  if progress_callback:
308
  progress_callback(80)
309
  return documentsToMeasure, RelevantDocuments, extracted_path
 
310