Marthee committed on
Commit
0cd855f
·
verified ·
1 Parent(s): cf4099a

Update tsadropboxretrieval.py

Browse files
Files changed (1) hide show
  1. tsadropboxretrieval.py +20 -10
tsadropboxretrieval.py CHANGED
@@ -1,8 +1,6 @@
1
  # -*- coding: utf-8 -*-
2
  """TSADropboxRetrieval.ipynb
3
-
4
  Automatically generated by Colaboratory.
5
-
6
  Original file is located at
7
  https://colab.research.google.com/drive/1d-UI3Y-z7Dj-vqu69CxluOUnN4rvsUuE
8
  """
@@ -205,12 +203,14 @@ def GetParquetDF():
205
  return df
206
 
207
 
208
- def getPathtoPDF_File(nameofPDF):
209
  parquetDf = load_parquet_df()
210
  nameofPDF=nameofPDF.replace('"', '')
211
  try:
212
  # path=parquetDf.loc[parquetDf['name'] == nameofPDF, 'path_display'].iloc[0]
213
  path = parquetDf.at[parquetDf.index[parquetDf['name'] == nameofPDF][0], 'path_display']
 
 
214
  link=getSharedLink(path)
215
  print(path,link)
216
  except:
@@ -225,9 +225,9 @@ def getPDFData(path):
225
  data = res.content
226
  return data
227
 
228
-
229
- def retrieveProjects(projname):
230
- print('retrieve', projname)
231
 
232
  projname = '/' + projname.split(' ')[0] # Extract main project name
233
  projname = projname.replace('/"', '') # Remove unwanted characters
@@ -237,18 +237,28 @@ def retrieveProjects(projname):
237
  documentsToMeasure = []
238
  RelevantDocuments = []
239
 
 
 
 
 
240
  # Store the original path before converting it to lowercase
241
  parquetDf['original_path_display'] = parquetDf['path_display']
242
 
243
  # Create a lowercase column for case-insensitive matching
244
  parquetDf['path_display_lower'] = parquetDf['path_display'].str.lower()
245
-
 
246
  # Filter using the lowercase column but retrieve the original paths
247
  mask = parquetDf['path_display_lower'].apply(lambda x: '/'+projname in x and '01 project details' in x)
248
-
 
249
  # Retrieve the original (case-sensitive) paths before lowering them
250
  RelevantDocuments = parquetDf[mask][['name', 'original_path_display']].values.tolist()
251
  documentsToMeasure = [doc for doc in RelevantDocuments if doc[0].endswith('.pdf')] # Keep only PDFs
 
 
 
 
252
 
253
  # Extract path from the original (case-sensitive) column
254
  if RelevantDocuments:
@@ -258,6 +268,6 @@ def retrieveProjects(projname):
258
 
259
  # Remove temporary columns
260
  parquetDf.drop(columns=['original_path_display', 'path_display_lower'], inplace=True)
261
-
 
262
  return documentsToMeasure, RelevantDocuments, extracted_path
263
-
 
1
  # -*- coding: utf-8 -*-
2
  """TSADropboxRetrieval.ipynb
 
3
  Automatically generated by Colaboratory.
 
4
  Original file is located at
5
  https://colab.research.google.com/drive/1d-UI3Y-z7Dj-vqu69CxluOUnN4rvsUuE
6
  """
 
203
  return df
204
 
205
 
206
+ def getPathtoPDF_File(nameofPDF,progress_callback=None):
207
  parquetDf = load_parquet_df()
208
  nameofPDF=nameofPDF.replace('"', '')
209
  try:
210
  # path=parquetDf.loc[parquetDf['name'] == nameofPDF, 'path_display'].iloc[0]
211
  path = parquetDf.at[parquetDf.index[parquetDf['name'] == nameofPDF][0], 'path_display']
212
+ if progress_callback:
213
+ progress_callback(60)
214
  link=getSharedLink(path)
215
  print(path,link)
216
  except:
 
225
  data = res.content
226
  return data
227
 
228
+ def retrieveProjects(projname, progress_callback=None):
229
+ # if progress_callback:
230
+ progress_callback(20)
231
 
232
  projname = '/' + projname.split(' ')[0] # Extract main project name
233
  projname = projname.replace('/"', '') # Remove unwanted characters
 
237
  documentsToMeasure = []
238
  RelevantDocuments = []
239
 
240
+ # Send progress update (40%)
241
+ if progress_callback:
242
+ progress_callback(40)
243
+
244
  # Store the original path before converting it to lowercase
245
  parquetDf['original_path_display'] = parquetDf['path_display']
246
 
247
  # Create a lowercase column for case-insensitive matching
248
  parquetDf['path_display_lower'] = parquetDf['path_display'].str.lower()
249
+ if progress_callback:
250
+ progress_callback(50)
251
  # Filter using the lowercase column but retrieve the original paths
252
  mask = parquetDf['path_display_lower'].apply(lambda x: '/'+projname in x and '01 project details' in x)
253
+ if progress_callback:
254
+ progress_callback(60)
255
  # Retrieve the original (case-sensitive) paths before lowering them
256
  RelevantDocuments = parquetDf[mask][['name', 'original_path_display']].values.tolist()
257
  documentsToMeasure = [doc for doc in RelevantDocuments if doc[0].endswith('.pdf')] # Keep only PDFs
258
+
259
+ # Send progress update (70%)
260
+ if progress_callback:
261
+ progress_callback(70)
262
 
263
  # Extract path from the original (case-sensitive) column
264
  if RelevantDocuments:
 
268
 
269
  # Remove temporary columns
270
  parquetDf.drop(columns=['original_path_display', 'path_display_lower'], inplace=True)
271
+ if progress_callback:
272
+ progress_callback(80)
273
  return documentsToMeasure, RelevantDocuments, extracted_path