Marthee committed on
Commit
7883bee
·
verified ·
1 Parent(s): 7938d02

Update tsadropboxretrieval.py

Browse files
Files changed (1) hide show
  1. tsadropboxretrieval.py +80 -51
tsadropboxretrieval.py CHANGED
@@ -1,8 +1,6 @@
1
  # -*- coding: utf-8 -*-
2
  """TSADropboxRetrieval.ipynb
3
-
4
  Automatically generated by Colaboratory.
5
-
6
  Original file is located at
7
  https://colab.research.google.com/drive/1d-UI3Y-z7Dj-vqu69CxluOUnN4rvsUuE
8
  """
@@ -23,8 +21,14 @@ import io
23
  import re
24
  import pyarrow
25
  from io import BytesIO
 
 
 
 
 
 
 
26
 
27
- """### NEW CODE - OCTOBER 26 - Marthe"""
28
 
29
  files_list=[]
30
 
@@ -101,6 +105,7 @@ def dropbox_upload_file(df, flag=0):
101
  print('Error uploading file to Dropbox: ' + str(e))
102
  return dbxTeam
103
 
 
104
  def check_if_file_exists(dbxTeam,path):
105
  try:
106
  md = dbxTeam.files_get_metadata(path)
@@ -109,7 +114,25 @@ def check_if_file_exists(dbxTeam,path):
109
  except Exception as error_response:
110
  exists_bool = False
111
  return exists_bool
112
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  def uploadanyFile(doc,pdfname,path,flag=0):
114
  try:
115
  dbxTeam= ADR_Access_DropboxTeam('admin')
@@ -131,11 +154,10 @@ def uploadanyFile(doc,pdfname,path,flag=0):
131
  dbxTeam.files_delete(path)
132
  meta=dbxTeam.files_upload(doc.write() ,path)
133
  try:
134
- print('hereintry')
135
  shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(path)
136
  except:
137
- print('hereinexcept')
138
  shared_link_metadata=dbxTeam.sharing_create_shared_link(path)
 
139
  return shared_link_metadata.url
140
  except Exception as e:
141
  print('Error uploading file to Dropbox: ' + str(e))
@@ -181,30 +203,14 @@ def GetParquetDF():
181
  return df
182
 
183
 
184
- def getPathtoPDF_File(nameofPDF):
185
- parquetDf=GetParquetDF()
186
- nameofPDF=nameofPDF.replace('"', '')
187
- try:
188
- path=parquetDf.loc[parquetDf['name'] == nameofPDF, 'path_display'].iloc[0]
189
- link=getSharedLink(path)
190
- print(path,link)
191
- except:
192
- return 'Project does not exist'
193
- return path,link
194
- # parquetDf
195
-
196
- # getPathtoPDF_File('A5157-EBLA-V5-XX-SH-L-0004-D2-01.pdf')
197
- def getPDFData(path):
198
- dbxTeam= ADR_Access_DropboxTeam('admin')
199
- md, res =dbxTeam.files_download(path)
200
- data = res.content
201
- return data
202
-
203
- def getPathtoPDF_File(nameofPDF):
204
- parquetDf=GetParquetDF()
205
  nameofPDF=nameofPDF.replace('"', '')
206
  try:
207
- path=parquetDf.loc[parquetDf['name'] == nameofPDF, 'path_display'].iloc[0]
 
 
 
208
  link=getSharedLink(path)
209
  print(path,link)
210
  except:
@@ -219,26 +225,49 @@ def getPDFData(path):
219
  data = res.content
220
  return data
221
 
222
- def retrieveProjects(projname):
223
- print('retrieve',projname)
224
-
225
- projname='/'+projname.split(' ')[0]
226
- projname=projname.replace('/"', '')
227
- print(projname)
228
- parquetDf=GetParquetDF()
229
- documentsToMeasure = []
230
- RelevantDocuments = []
231
- parquetDf['path_display_lower'] = parquetDf['path_display'].str.lower()
232
-
233
- # Filter based on the presence of '/2221' and '01 Project Details'
234
- mask = parquetDf['path_display_lower'].apply(lambda x: projname in x and '01 project details' in x)
235
- print(mask)
236
- # Filter RelevantDocuments and documentsToMeasure using the mask
237
- RelevantDocuments = parquetDf[mask][['name', 'path_display_lower']].values.tolist()
238
- documentsToMeasure = [doc for doc in RelevantDocuments if doc[0].endswith('.pdf')] # Filter documentsToMeasure for PDF files later if needed
239
- print(documentsToMeasure)
240
- # Remove the temporary 'path_display_lower' column
241
- parquetDf.drop(columns=['path_display_lower'], inplace=True)
242
- print(len(documentsToMeasure))
243
- return documentsToMeasure,RelevantDocuments
244
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # -*- coding: utf-8 -*-
2
  """TSADropboxRetrieval.ipynb
 
3
  Automatically generated by Colaboratory.
 
4
  Original file is located at
5
  https://colab.research.google.com/drive/1d-UI3Y-z7Dj-vqu69CxluOUnN4rvsUuE
6
  """
 
21
  import re
22
  import pyarrow
23
  from io import BytesIO
24
+ from functools import lru_cache
25
+
26
+
27
+
28
@lru_cache(maxsize=1)
def load_parquet_df():
    """Return the DataFrame produced by ``GetParquetDF()``, memoized.

    The function takes no arguments, so ``lru_cache`` keeps a single entry:
    the result of the first successful call is handed back on every later
    call without invoking ``GetParquetDF`` again.

    NOTE: every caller receives the *same* object. In-place changes made by
    one caller are visible to all others.
    """
    cached_frame = GetParquetDF()
    return cached_frame
31
 
 
32
 
33
  files_list=[]
34
 
 
105
  print('Error uploading file to Dropbox: ' + str(e))
106
  return dbxTeam
107
 
108
+
109
  def check_if_file_exists(dbxTeam,path):
110
  try:
111
  md = dbxTeam.files_get_metadata(path)
 
114
  except Exception as error_response:
115
  exists_bool = False
116
  return exists_bool
117
def uploadmarkupPDFTable(doc,pdfname,path):
    """Upload a readable document to Dropbox and return its shared-link URL.

    Args:
        doc: Object exposing ``read()``; whatever it returns is passed to
            ``files_upload`` as the file content.
        pdfname: File name appended to ``path`` to form the destination.
        path: Destination folder prefix. It is concatenated with ``pdfname``
            as-is, so it presumably must already end with '/' (confirm
            against callers).

    Returns:
        The ``url`` attribute of the shared-link metadata, or ``None`` when
        the upload or link creation fails (the error is printed).
    """
    dbxTeam = ADR_Access_DropboxTeam('admin')
    try:
        target_path = path + pdfname

        # An existing file at the destination is removed before uploading.
        if check_if_file_exists(dbxTeam, target_path):
            dbxTeam.files_delete(target_path)

        dbxTeam.files_upload(doc.read(), target_path)

        try:
            shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(target_path)
        except Exception:
            # Fall back to the older endpoint when the settings-based call
            # fails (presumably because a link already exists - confirm).
            # ``Exception`` rather than a bare ``except`` so Ctrl-C and
            # interpreter shutdown are not swallowed here.
            shared_link_metadata = dbxTeam.sharing_create_shared_link(target_path)

        return shared_link_metadata.url
    except Exception as e:
        print('Error uploading file to Dropbox: ' + str(e))
        return None
135
+
136
  def uploadanyFile(doc,pdfname,path,flag=0):
137
  try:
138
  dbxTeam= ADR_Access_DropboxTeam('admin')
 
154
  dbxTeam.files_delete(path)
155
  meta=dbxTeam.files_upload(doc.write() ,path)
156
  try:
 
157
  shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(path)
158
  except:
 
159
  shared_link_metadata=dbxTeam.sharing_create_shared_link(path)
160
+ # print(shared_link_metadata.url)
161
  return shared_link_metadata.url
162
  except Exception as e:
163
  print('Error uploading file to Dropbox: ' + str(e))
 
203
  return df
204
 
205
 
206
+ def getPathtoPDF_File(nameofPDF,progress_callback=None):
207
+ parquetDf = load_parquet_df()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
  nameofPDF=nameofPDF.replace('"', '')
209
  try:
210
+ # path=parquetDf.loc[parquetDf['name'] == nameofPDF, 'path_display'].iloc[0]
211
+ path = parquetDf.at[parquetDf.index[parquetDf['name'] == nameofPDF][0], 'path_display']
212
+ if progress_callback:
213
+ progress_callback(60)
214
  link=getSharedLink(path)
215
  print(path,link)
216
  except:
 
225
  data = res.content
226
  return data
227
 
228
def retrieveProjects(projname, progress_callback=None):
    """Find the documents stored under a project's "01 Project Details" folder.

    Args:
        projname: Project label. Only the first whitespace-separated token is
            used as the project key; double quotes and surrounding slashes
            are ignored, and matching is case-insensitive.
        progress_callback: Optional callable taking one integer. It is called
            with 20, 40, 50, 60, 70 and 80 as the lookup advances.

    Returns:
        A tuple ``(documentsToMeasure, RelevantDocuments, extracted_path)``:
        ``RelevantDocuments`` is a list of ``[name, path_display]`` pairs whose
        path contains ``/<project key>`` and ``01 project details``
        (case-insensitive); ``documentsToMeasure`` is the subset whose name
        ends with ``.pdf``; ``extracted_path`` is the first matching path cut
        off right after its "01 Project Details" segment, or ``None`` when
        nothing matched.
    """
    def _report(value):
        # The callback is optional; every progress update goes through this
        # guard so the default ``None`` never gets called.
        if progress_callback:
            progress_callback(value)

    _report(20)

    # Normalise the key once: drop quotes, take the first token, strip any
    # slashes the caller supplied, and lower-case it because it is compared
    # against lower-cased paths below.
    tokens = projname.replace('"', '').split()
    project_key = tokens[0].strip('/').lower() if tokens else ''
    print('projname', project_key)

    parquetDf = load_parquet_df()
    _report(40)

    if not project_key:
        # An empty key would otherwise match every project folder.
        _report(80)
        return [], [], None

    # Work on a derived Series instead of adding temporary columns: the
    # DataFrame comes from a shared cache and must not be mutated here.
    lowered_paths = parquetDf['path_display'].str.lower()
    _report(50)

    mask = (
        lowered_paths.str.contains('/' + project_key, regex=False, na=False)
        & lowered_paths.str.contains('01 project details', regex=False, na=False)
    )
    _report(60)

    RelevantDocuments = parquetDf.loc[mask, ['name', 'path_display']].values.tolist()
    documentsToMeasure = [doc for doc in RelevantDocuments if doc[0].endswith('.pdf')]
    _report(70)

    extracted_path = None
    if RelevantDocuments:
        first_path = RelevantDocuments[0][1]
        # The filter above is case-insensitive, so locate the folder segment
        # the same way and keep the path's original casing.
        folder_match = re.search(re.escape('01 Project Details'), first_path, re.IGNORECASE)
        if folder_match:
            extracted_path = first_path[:folder_match.end()]

    _report(80)
    return documentsToMeasure, RelevantDocuments, extracted_path