Marthee committed on
Commit
52bfe43
·
verified ·
1 Parent(s): 39c4ab6

Update tsadropboxretrieval.py

Browse files
Files changed (1) hide show
  1. tsadropboxretrieval.py +153 -55
tsadropboxretrieval.py CHANGED
@@ -1,8 +1,6 @@
1
  # -*- coding: utf-8 -*-
2
  """TSADropboxRetrieval.ipynb
3
-
4
  Automatically generated by Colaboratory.
5
-
6
  Original file is located at
7
  https://colab.research.google.com/drive/1d-UI3Y-z7Dj-vqu69CxluOUnN4rvsUuE
8
  """
@@ -10,7 +8,7 @@ Original file is located at
10
  # !pip install dropbox -q
11
 
12
  # pip install pymupdf #==1.22.5
13
-
14
  import base64
15
  import requests
16
  import json
@@ -23,8 +21,16 @@ import io
23
  import re
24
  import pyarrow
25
  from io import BytesIO
 
 
 
 
 
 
 
 
 
26
 
27
- """### NEW CODE - OCTOBER 26 - Marthe"""
28
 
29
  files_list=[]
30
 
@@ -101,6 +107,7 @@ def dropbox_upload_file(df, flag=0):
101
  print('Error uploading file to Dropbox: ' + str(e))
102
  return dbxTeam
103
 
 
104
  def check_if_file_exists(dbxTeam,path):
105
  try:
106
  md = dbxTeam.files_get_metadata(path)
@@ -109,39 +116,93 @@ def check_if_file_exists(dbxTeam,path):
109
  except Exception as error_response:
110
  exists_bool = False
111
  return exists_bool
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
def uploadanyFile(doc, pdfname, path, flag=0):
    """Upload a document to Dropbox, replacing any existing file, and return its shared link.

    Args:
        doc: Object to upload. When ``flag`` is truthy it must provide
            ``write(file, encoding=..., xml_declaration=...)`` (presumably an
            ElementTree - confirm against callers); otherwise ``doc.write()``
            must return the bytes to upload.
        pdfname: Target file name. With ``flag`` set, everything from the first
            ``'.pdf'`` onward is replaced by ``'.xml'``.
        path: Dropbox folder prefix; it is concatenated with the file name as-is.
        flag: Truthy to upload ``doc`` as XML, falsy to upload ``doc.write()``.

    Returns:
        The shared-link URL on success, or the string
        ``'Error uploading file to Dropbox.'`` if anything fails.
    """
    try:
        dbxTeam = ADR_Access_DropboxTeam('admin')
        print('ppp')
        if flag:  # doc is an XML tree
            pdfname = str(pdfname).split('.pdf')[0] + '.xml'
            path = path + pdfname
            print(path)
            f = BytesIO()
            doc.write(f, encoding='utf-8', xml_declaration=True)
            payload = f.getvalue()
        else:
            path = path + pdfname
            # Serialise before touching Dropbox so that a failure here cannot
            # leave the previously uploaded file deleted.
            payload = doc.write()

        # Replace any existing file at the same path.
        if check_if_file_exists(dbxTeam, path):
            dbxTeam.files_delete(path)
        dbxTeam.files_upload(payload, path)

        try:
            print('hereintry')
            shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(path)
        except Exception:
            # Fall back to the older endpoint (e.g. when a link already exists)
            # without swallowing KeyboardInterrupt/SystemExit.
            print('hereinexcept')
            shared_link_metadata = dbxTeam.sharing_create_shared_link(path)
        return shared_link_metadata.url
    except Exception as e:
        print('Error uploading file to Dropbox: ' + str(e))
        return 'Error uploading file to Dropbox.'
144
 
 
145
  # Call when the dropbox is updated with new items - if not , call parquet saved version of the df of saved items
146
  # Call when the dropbox is updated with new items - if not , call parquet saved version of the df of saved items
147
  def DropboxItemstoDF(folder_path):
@@ -181,11 +242,14 @@ def GetParquetDF():
181
  return df
182
 
183
 
184
- def getPathtoPDF_File(nameofPDF):
185
- parquetDf=GetParquetDF()
186
  nameofPDF=nameofPDF.replace('"', '')
187
  try:
188
- path=parquetDf.loc[parquetDf['name'] == nameofPDF, 'path_display'].iloc[0]
 
 
 
189
  link=getSharedLink(path)
190
  print(path,link)
191
  except:
@@ -200,29 +264,63 @@ def getPDFData(path):
200
  data = res.content
201
  return data
202
 
203
def retrieveProjects(projname):
    """Return the documents whose Dropbox path mentions every word of the project name.

    The project name is suffixed with ``' 01 Project Details'``, lower-cased and
    split on punctuation/space characters; a row of the parquet DataFrame is
    relevant when each resulting token occurs in its lower-cased
    ``path_display``.

    Returns:
        ``(documentsToMeasure, RelevantDocuments)`` where ``RelevantDocuments``
        is a list of ``[name, path_display]`` pairs and ``documentsToMeasure``
        is the subset whose name ends with ``'.pdf'``.
    """
    print('retrieve')

    parquetDf = GetParquetDF()
    projnameWithDetails = f'{projname} 01 Project Details'
    # Tokens every relevant path must contain (case-insensitively).
    required_tokens = set(re.split(r'[`\-= ~!@#$%^&*()_+\[\]{};\'\\:"|<,/<>?]', projnameWithDetails.lower()))

    def has_every_token(lowered_path):
        for token in required_tokens:
            if token not in lowered_path:
                return False
        return True

    # Temporary lower-cased column used only for matching.
    parquetDf['path_display_lower'] = parquetDf['path_display'].str.lower()
    mask = parquetDf['path_display_lower'].apply(has_every_token)

    RelevantDocuments = parquetDf[mask][['name', 'path_display']].values.tolist()
    documentsToMeasure = []
    for entry in RelevantDocuments:
        if entry[0].endswith('.pdf'):
            documentsToMeasure.append(entry)

    # Drop the helper column again.
    parquetDf.drop(columns=['path_display_lower'], inplace=True)
    print('done')
    return documentsToMeasure, RelevantDocuments
227
-
228
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # -*- coding: utf-8 -*-
2
  """TSADropboxRetrieval.ipynb
 
3
  Automatically generated by Colaboratory.
 
4
  Original file is located at
5
  https://colab.research.google.com/drive/1d-UI3Y-z7Dj-vqu69CxluOUnN4rvsUuE
6
  """
 
8
  # !pip install dropbox -q
9
 
10
  # pip install pymupdf #==1.22.5
11
+ import os
12
  import base64
13
  import requests
14
  import json
 
21
  import re
22
  import pyarrow
23
  from io import BytesIO
24
+ from functools import lru_cache
25
+ from io import BytesIO
26
+ import os
27
+
28
+
29
+
30
@lru_cache(maxsize=1)
def load_parquet_df():
    """Return the DataFrame from ``GetParquetDF``, computed once and then cached.

    Because the result is memoised, every caller receives the same object;
    NOTE(review): in-place changes made by one caller are therefore visible to
    all later callers.
    """
    cached_frame = GetParquetDF()
    return cached_frame
33
 
 
34
 
35
  files_list=[]
36
 
 
107
  print('Error uploading file to Dropbox: ' + str(e))
108
  return dbxTeam
109
 
110
+
111
  def check_if_file_exists(dbxTeam,path):
112
  try:
113
  md = dbxTeam.files_get_metadata(path)
 
116
  except Exception as error_response:
117
  exists_bool = False
118
  return exists_bool
119
def uploadmarkupPDFTable(doc, pdfname, path):
    """Upload a file-like object to Dropbox, replacing any existing file, and return its shared link.

    Args:
        doc: Object whose ``read()`` returns the bytes to upload.
        pdfname: Target file name.
        path: Dropbox folder prefix; it is concatenated with ``pdfname`` as-is.

    Returns:
        The shared-link URL on success, or ``None`` if the upload or link
        creation fails (the error is printed). Errors raised while obtaining
        the Dropbox client propagate to the caller, as before.
    """
    dbxTeam = ADR_Access_DropboxTeam('admin')
    try:
        path = path + pdfname
        if check_if_file_exists(dbxTeam, path):
            print("if gowa el else <3")
            dbxTeam.files_delete(path)
        print("abl el meta <3")
        dbxTeam.files_upload(doc.read(), path)
        try:
            shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(path)
        except Exception:
            # Fall back to the older endpoint without swallowing
            # KeyboardInterrupt/SystemExit, which a bare ``except`` would do.
            shared_link_metadata = dbxTeam.sharing_create_shared_link(path)
        return shared_link_metadata.url
    except Exception as e:
        print('Error uploading file to Dropbox: ' + str(e))
        # Explicit for readability; the original fell off the end here.
        return None
137
+
138
+
139
 
140
def upload_string_file(content_str, filename, path):
    """Upload a text string to Dropbox as a UTF-8 file and return its shared link.

    Args:
        content_str: Text to upload; it is encoded as UTF-8.
        filename: Target file name.
        path: Dropbox folder prefix; it is concatenated with ``filename`` as-is.

    Returns:
        The shared-link URL on success, or the string
        ``"Error uploading file to Dropbox."`` if anything fails.
    """
    try:
        dbxTeam = ADR_Access_DropboxTeam('admin')
        full_path = path + filename
        # Encode directly; wrapping the bytes in BytesIO only to call
        # getvalue() added nothing.
        payload = content_str.encode("utf-8")

        # Replace any existing file at the same path.
        if check_if_file_exists(dbxTeam, full_path):
            dbxTeam.files_delete(full_path)
        dbxTeam.files_upload(payload, full_path)

        try:
            shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(full_path)
        except Exception:
            # Fall back to the older endpoint without swallowing
            # KeyboardInterrupt/SystemExit, which a bare ``except`` would do.
            shared_link_metadata = dbxTeam.sharing_create_shared_link(full_path)
        return shared_link_metadata.url
    except Exception as e:
        print("Error uploading file to Dropbox:", str(e))
        return "Error uploading file to Dropbox."
160
+
161
+
162
+
163
+
164
def uploadanyFile(doc, pdfname, path, flag=0):
    """Upload a document to Dropbox under a unique name and return its shared link.

    If a file already exists at the target path, ``(1)``, ``(2)``, ... is
    inserted before the extension until an unused name is found, so existing
    files are never overwritten.

    Args:
        doc: Object to upload. When ``flag`` is truthy it must provide
            ``write(file, encoding=..., xml_declaration=...)`` (presumably an
            ElementTree - confirm against callers); otherwise ``doc.write()``
            must return the bytes to upload.
        pdfname: Target file name. With ``flag`` set, everything from the first
            ``'.pdf'`` onward is replaced by ``'.xml'``.
        path: Dropbox folder prefix; it is concatenated with the file name as-is.
        flag: Truthy to upload ``doc`` as XML, falsy to upload ``doc.write()``.

    Returns:
        The shared-link URL on success, or the string
        ``'Error uploading file to Dropbox.'`` if anything fails.
    """
    try:
        dbxTeam = ADR_Access_DropboxTeam('admin')

        # Normalise once so both branches (and os.path.splitext below) always
        # work on a str; the original only did this in the XML branch.
        pdfname = str(pdfname)

        if flag:  # XML upload
            pdfname = pdfname.split('.pdf')[0] + '.xml'
            f = BytesIO()
            doc.write(f, encoding='utf-8', xml_declaration=True)
            data_to_upload = f.getvalue()
        else:
            data_to_upload = doc.write()
        file_path = path + pdfname

        # If the name is taken, try "name(1).ext", "name(2).ext", ... until free.
        base_name, ext = os.path.splitext(pdfname)
        counter = 1
        while check_if_file_exists(dbxTeam, file_path):
            file_path = path + f"{base_name}({counter}){ext}"
            counter += 1

        dbxTeam.files_upload(data_to_upload, file_path)

        try:
            shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(file_path)
        except Exception:
            # Fall back to the older endpoint without swallowing
            # KeyboardInterrupt/SystemExit, which a bare ``except`` would do.
            shared_link_metadata = dbxTeam.sharing_create_shared_link(file_path)

        return shared_link_metadata.url

    except Exception as e:
        print('Error uploading file to Dropbox: ' + str(e))
        return 'Error uploading file to Dropbox.'
204
 
205
+
206
  # Call when the dropbox is updated with new items - if not , call parquet saved version of the df of saved items
207
  # Call when the dropbox is updated with new items - if not , call parquet saved version of the df of saved items
208
  def DropboxItemstoDF(folder_path):
 
242
  return df
243
 
244
 
245
+ def getPathtoPDF_File(nameofPDF,progress_callback=None):
246
+ parquetDf = load_parquet_df()
247
  nameofPDF=nameofPDF.replace('"', '')
248
  try:
249
+ # path=parquetDf.loc[parquetDf['name'] == nameofPDF, 'path_display'].iloc[0]
250
+ path = parquetDf.at[parquetDf.index[parquetDf['name'] == nameofPDF][0], 'path_display']
251
+ if progress_callback:
252
+ progress_callback(60)
253
  link=getSharedLink(path)
254
  print(path,link)
255
  except:
 
264
  data = res.content
265
  return data
266
 
267
def retrieveProjects(projname, progress_callback=None):
    """Find the documents stored in a project's "01 Project Details" folder.

    The project key is ``'/'`` followed by the first four characters of
    ``projname`` after stripping whitespace, quotes and spaces. A row of the
    cached parquet DataFrame is relevant when the folder part of its
    ``path_display`` contains, case-insensitively, both that key and
    ``'01 project details'``.

    Args:
        projname: Project name or number as entered by the user.
        progress_callback: Optional callable that receives progress
            percentages (20, 40, 50, 60, 70, 80). It may be ``None``.

    Returns:
        ``(documentsToMeasure, RelevantDocuments, extracted_path)`` where
        ``RelevantDocuments`` is a list of ``[name, path_display]`` pairs,
        ``documentsToMeasure`` is the subset whose name ends with ``'.pdf'``,
        and ``extracted_path`` is the first relevant path truncated just after
        its "01 Project Details" component (``None`` when nothing matched).
    """
    details_folder = "01 Project Details"
    details_key = details_folder.lower()

    def report(percent):
        # The callback is optional; the first call used to be unguarded and
        # raised TypeError when progress_callback was left at its default.
        if progress_callback:
            progress_callback(percent)

    report(20)
    projnameNospaces = projname.strip().replace('"', '').replace("'", '').replace(" ", "")
    print(projname,projnameNospaces)
    projname = '/' + projnameNospaces[:4]  # Main project key, e.g. "/2564"
    print('projname', projname)
    # Paths are compared in lower case, so the key has to be lower case too.
    project_key = projname.lower()

    parquetDf = load_parquet_df()
    report(40)

    # Work on a separate Series instead of adding temporary columns: the
    # DataFrame is cached and shared between calls, so it must not be mutated.
    lowered_paths = parquetDf['path_display'].str.lower()
    report(50)

    def path_matches(lowered_path):
        if not isinstance(lowered_path, str):
            return False  # missing path values cannot match
        # Only the folder part counts, not the file name itself.
        folder_path = os.path.dirname(lowered_path)
        return project_key in folder_path and details_key in folder_path

    mask = lowered_paths.apply(path_matches).astype(bool)
    report(60)

    # Report the original (case-preserved) paths.
    RelevantDocuments = parquetDf.loc[mask, ['name', 'path_display']].values.tolist()
    documentsToMeasure = [doc for doc in RelevantDocuments if doc[0].endswith('.pdf')]  # Keep only PDFs
    print('documentsToMeasure',documentsToMeasure)
    report(70)

    extracted_path = None
    if RelevantDocuments:
        first_path = RelevantDocuments[0][1]
        # Locate the folder case-insensitively (the match above was
        # case-insensitive) while keeping the path's original casing.
        found = re.search(re.escape(details_folder), first_path, re.IGNORECASE)
        if found:
            extracted_path = first_path[:found.end()]
        else:
            extracted_path = first_path.split(details_folder)[0] + details_folder

    report(80)
    return documentsToMeasure, RelevantDocuments, extracted_path
326
+