Update tsadropboxretrieval.py
Browse files: tsadropboxretrieval.py +15 -6
tsadropboxretrieval.py
CHANGED
|
@@ -8,7 +8,7 @@ Original file is located at
|
|
| 8 |
# !pip install dropbox -q
|
| 9 |
|
| 10 |
# pip install pymupdf #==1.22.5
|
| 11 |
-
|
| 12 |
import base64
|
| 13 |
import requests
|
| 14 |
import json
|
|
@@ -248,7 +248,6 @@ def getPDFData(path):
|
|
| 248 |
data = res.content
|
| 249 |
return data
|
| 250 |
|
| 251 |
-
|
| 252 |
def retrieveProjects(projname, progress_callback=None):
|
| 253 |
# if progress_callback:
|
| 254 |
progress_callback(20)
|
|
@@ -273,11 +272,22 @@ def retrieveProjects(projname, progress_callback=None):
|
|
| 273 |
parquetDf['path_display_lower'] = parquetDf['path_display'].str.lower()
|
| 274 |
if progress_callback:
|
| 275 |
progress_callback(50)
|
|
|
|
| 276 |
# Filter using the lowercase column but retrieve the original paths
|
| 277 |
-
|
| 278 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
if progress_callback:
|
| 280 |
-
progress_callback(60)
|
| 281 |
# Retrieve the original (case-sensitive) paths before lowering them
|
| 282 |
RelevantDocuments = parquetDf[mask][['name', 'original_path_display']].values.tolist()
|
| 283 |
documentsToMeasure = [doc for doc in RelevantDocuments if doc[0].endswith('.pdf')] # Keep only PDFs
|
|
@@ -297,5 +307,4 @@ def retrieveProjects(projname, progress_callback=None):
|
|
| 297 |
if progress_callback:
|
| 298 |
progress_callback(80)
|
| 299 |
return documentsToMeasure, RelevantDocuments, extracted_path
|
| 300 |
-
|
| 301 |
|
|
|
|
| 8 |
# !pip install dropbox -q
|
| 9 |
|
| 10 |
# pip install pymupdf #==1.22.5
|
| 11 |
+
import os
|
| 12 |
import base64
|
| 13 |
import requests
|
| 14 |
import json
|
|
|
|
| 248 |
data = res.content
|
| 249 |
return data
|
| 250 |
|
|
|
|
| 251 |
def retrieveProjects(projname, progress_callback=None):
|
| 252 |
# if progress_callback:
|
| 253 |
progress_callback(20)
|
|
|
|
| 272 |
parquetDf['path_display_lower'] = parquetDf['path_display'].str.lower()
|
| 273 |
if progress_callback:
|
| 274 |
progress_callback(50)
|
| 275 |
+
##### Updated code #######
|
| 276 |
# Filter using the lowercase column but retrieve the original paths
|
| 277 |
+
def path_matches(x):
|
| 278 |
+
# Strip the file name so that only the containing folder path is checked
|
| 279 |
+
folder_path = os.path.dirname(x)
|
| 280 |
+
# Check whether the project number (e.g. /2564) appears in the folder path
|
| 281 |
+
has_projnum = projname in folder_path
|
| 282 |
+
# Check whether "01 project details" appears in the folder path
|
| 283 |
+
has_details = '01 project details' in folder_path
|
| 284 |
+
return has_projnum and has_details
|
| 285 |
+
|
| 286 |
+
# Build the boolean mask by running path_matches on every lowercased path
|
| 287 |
+
mask = parquetDf['path_display_lower'].apply(path_matches)
|
| 288 |
+
#### End of updated code #####
|
| 289 |
if progress_callback:
|
| 290 |
+
progress_callback(60)
|
| 291 |
# Retrieve the original (case-sensitive) paths before lowering them
|
| 292 |
RelevantDocuments = parquetDf[mask][['name', 'original_path_display']].values.tolist()
|
| 293 |
documentsToMeasure = [doc for doc in RelevantDocuments if doc[0].endswith('.pdf')] # Keep only PDFs
|
|
|
|
| 307 |
if progress_callback:
|
| 308 |
progress_callback(80)
|
| 309 |
return documentsToMeasure, RelevantDocuments, extracted_path
|
|
|
|
| 310 |
|