LatestDuplicate_Working

Paused

Marthee committed on Oct 24, 2025

Commit

7e8489d

verified ·

1 Parent(s): 133b1c7

Update tsadropboxretrieval.py

Files changed (1) hide show

tsadropboxretrieval.py CHANGED Viewed

@@ -8,7 +8,7 @@ Original file is located at
 # !pip install dropbox -q
 # pip install pymupdf #==1.22.5
 import base64
 import requests
 import json
@@ -248,7 +248,6 @@ def getPDFData(path):
   data = res.content
   return data
 def retrieveProjects(projname, progress_callback=None):
     # if progress_callback:
     progress_callback(20)
@@ -273,11 +272,22 @@ def retrieveProjects(projname, progress_callback=None):
     parquetDf['path_display_lower'] = parquetDf['path_display'].str.lower()
     if progress_callback:
         progress_callback(50)
     # Filter using the lowercase column but retrieve the original paths
-    # mask = parquetDf['path_display_lower'].apply(lambda x: '/'+projname in x and '01 project details' in x)
-    mask = parquetDf['path_display_lower'].apply(lambda x: projname in x and '01 project details' in x)
     if progress_callback:
-        progress_callback(60)
     # Select the name and original (case-preserved) path of every row matched by the mask
     RelevantDocuments = parquetDf[mask][['name', 'original_path_display']].values.tolist()
     documentsToMeasure = [doc for doc in RelevantDocuments if doc[0].endswith('.pdf')]  # Keep only PDFs
@@ -297,5 +307,4 @@ def retrieveProjects(projname, progress_callback=None):
     if progress_callback:
         progress_callback(80)
     return documentsToMeasure, RelevantDocuments, extracted_path

 # !pip install dropbox -q
 # pip install pymupdf #==1.22.5
+import os
 import base64
 import requests
 import json
   data = res.content
   return data
 def retrieveProjects(projname, progress_callback=None):
     # if progress_callback:
     progress_callback(20)
     parquetDf['path_display_lower'] = parquetDf['path_display'].str.lower()
     if progress_callback:
         progress_callback(50)
+    ##### Updated code #######
     # Filter using the lowercase column but retrieve the original paths
+    def path_matches(x):
+        # Drop the file name so only the folder portion of the path is tested
+        folder_path = os.path.dirname(x)
+        # Check whether projname (e.g. "/2564") appears in the folder path
+        has_projnum = projname in folder_path
+        # Check whether "01 project details" appears in the folder path
+        has_details = '01 project details' in folder_path
+        return has_projnum and has_details
+    # Build the boolean mask by running path_matches on every lowercased path
+    mask = parquetDf['path_display_lower'].apply(path_matches)
+    #### End of updated code #####
     if progress_callback:
+        progress_callback(60)
     # Select the name and original (case-preserved) path of every row matched by the mask
     RelevantDocuments = parquetDf[mask][['name', 'original_path_display']].values.tolist()
     documentsToMeasure = [doc for doc in RelevantDocuments if doc[0].endswith('.pdf')]  # Keep only PDFs
     if progress_callback:
         progress_callback(80)
     return documentsToMeasure, RelevantDocuments, extracted_path