Spaces:
Runtime error
Runtime error
Update tsadropboxretrieval.py
Browse files- tsadropboxretrieval.py +153 -55
tsadropboxretrieval.py
CHANGED
|
@@ -1,8 +1,6 @@
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
"""TSADropboxRetrieval.ipynb
|
| 3 |
-
|
| 4 |
Automatically generated by Colaboratory.
|
| 5 |
-
|
| 6 |
Original file is located at
|
| 7 |
https://colab.research.google.com/drive/1d-UI3Y-z7Dj-vqu69CxluOUnN4rvsUuE
|
| 8 |
"""
|
|
@@ -10,7 +8,7 @@ Original file is located at
|
|
| 10 |
# !pip install dropbox -q
|
| 11 |
|
| 12 |
# pip install pymupdf #==1.22.5
|
| 13 |
-
|
| 14 |
import base64
|
| 15 |
import requests
|
| 16 |
import json
|
|
@@ -23,8 +21,16 @@ import io
|
|
| 23 |
import re
|
| 24 |
import pyarrow
|
| 25 |
from io import BytesIO
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
|
| 27 |
-
"""### NEW CODE - OCTOBER 26 - Marthe"""
|
| 28 |
|
| 29 |
files_list=[]
|
| 30 |
|
|
@@ -101,6 +107,7 @@ def dropbox_upload_file(df, flag=0):
|
|
| 101 |
print('Error uploading file to Dropbox: ' + str(e))
|
| 102 |
return dbxTeam
|
| 103 |
|
|
|
|
| 104 |
def check_if_file_exists(dbxTeam,path):
|
| 105 |
try:
|
| 106 |
md = dbxTeam.files_get_metadata(path)
|
|
@@ -109,39 +116,93 @@ def check_if_file_exists(dbxTeam,path):
|
|
| 109 |
except Exception as error_response:
|
| 110 |
exists_bool = False
|
| 111 |
return exists_bool
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
-
def
|
| 114 |
try:
|
| 115 |
-
dbxTeam= ADR_Access_DropboxTeam('admin')
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
else:
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 133 |
try:
|
| 134 |
-
|
| 135 |
-
shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(path)
|
| 136 |
except:
|
| 137 |
-
|
| 138 |
-
|
| 139 |
return shared_link_metadata.url
|
|
|
|
| 140 |
except Exception as e:
|
| 141 |
print('Error uploading file to Dropbox: ' + str(e))
|
| 142 |
|
| 143 |
return 'Error uploading file to Dropbox.'
|
| 144 |
|
|
|
|
| 145 |
# Call when the dropbox is updated with new items - if not , call parquet saved version of the df of saved items
|
| 146 |
# Call when the dropbox is updated with new items - if not , call parquet saved version of the df of saved items
|
| 147 |
def DropboxItemstoDF(folder_path):
|
|
@@ -181,11 +242,14 @@ def GetParquetDF():
|
|
| 181 |
return df
|
| 182 |
|
| 183 |
|
| 184 |
-
def getPathtoPDF_File(nameofPDF):
|
| 185 |
-
parquetDf=
|
| 186 |
nameofPDF=nameofPDF.replace('"', '')
|
| 187 |
try:
|
| 188 |
-
path=parquetDf.loc[parquetDf['name'] == nameofPDF, 'path_display'].iloc[0]
|
|
|
|
|
|
|
|
|
|
| 189 |
link=getSharedLink(path)
|
| 190 |
print(path,link)
|
| 191 |
except:
|
|
@@ -200,29 +264,63 @@ def getPDFData(path):
|
|
| 200 |
data = res.content
|
| 201 |
return data
|
| 202 |
|
| 203 |
-
def retrieveProjects(projname):
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# -*- coding: utf-8 -*-
|
| 2 |
"""TSADropboxRetrieval.ipynb
|
|
|
|
| 3 |
Automatically generated by Colaboratory.
|
|
|
|
| 4 |
Original file is located at
|
| 5 |
https://colab.research.google.com/drive/1d-UI3Y-z7Dj-vqu69CxluOUnN4rvsUuE
|
| 6 |
"""
|
|
|
|
| 8 |
# !pip install dropbox -q
|
| 9 |
|
| 10 |
# pip install pymupdf #==1.22.5
|
| 11 |
+
import os
|
| 12 |
import base64
|
| 13 |
import requests
|
| 14 |
import json
|
|
|
|
| 21 |
import re
|
| 22 |
import pyarrow
|
| 23 |
from io import BytesIO
|
| 24 |
+
from functools import lru_cache
|
| 25 |
+
from io import BytesIO
|
| 26 |
+
import os
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
@lru_cache(maxsize=1)
def load_parquet_df():
    """Return the DataFrame produced by ``GetParquetDF``, loading it only once.

    ``lru_cache(maxsize=1)`` memoises the single zero-argument call, so every
    caller receives the *same* DataFrame object. Callers must therefore treat
    it as read-only (or copy it) -- in-place edits are visible to everyone.
    Call ``load_parquet_df.cache_clear()`` to force a reload.
    """
    cached_frame = GetParquetDF()
    return cached_frame
|
| 33 |
|
|
|
|
| 34 |
|
| 35 |
files_list=[]
|
| 36 |
|
|
|
|
| 107 |
print('Error uploading file to Dropbox: ' + str(e))
|
| 108 |
return dbxTeam
|
| 109 |
|
| 110 |
+
|
| 111 |
def check_if_file_exists(dbxTeam,path):
|
| 112 |
try:
|
| 113 |
md = dbxTeam.files_get_metadata(path)
|
|
|
|
| 116 |
except Exception as error_response:
|
| 117 |
exists_bool = False
|
| 118 |
return exists_bool
|
| 119 |
+
def uploadmarkupPDFTable(doc, pdfname, path):
    """Upload a marked-up PDF to Dropbox, replacing any existing copy.

    Args:
        doc: Readable binary object; ``doc.read()`` supplies the bytes to upload.
        pdfname: File name appended to ``path`` to form the Dropbox path.
        path: Destination folder path (expected to end with a separator,
            since the name is concatenated directly).

    Returns:
        The shared-link URL of the uploaded file, or ``None`` when anything
        fails (the error is printed). Note that the sibling upload helpers
        return an error string instead; ``None`` is kept here so existing
        callers of this function see no change.
    """
    dbxTeam = ADR_Access_DropboxTeam('admin')
    try:
        path = path + pdfname
        exists_bool = check_if_file_exists(dbxTeam, path)
        if exists_bool:
            print("if gowa el else <3")
            # Remove the stale copy so the upload below does not conflict.
            dbxTeam.files_delete(path)
        print("abl el meta <3")
        dbxTeam.files_upload(doc.read(), path)
        try:
            shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(path)
        except Exception:
            # Presumably raised when a link already exists for this path;
            # fall back to the older call to obtain a link anyway.
            # Narrowed from a bare ``except:`` so Ctrl-C / SystemExit propagate.
            shared_link_metadata = dbxTeam.sharing_create_shared_link(path)
        return shared_link_metadata.url
    except Exception as e:
        print('Error uploading file to Dropbox: ' + str(e))
        return None
|
| 137 |
+
|
| 138 |
+
|
| 139 |
|
| 140 |
+
def upload_string_file(content_str, filename, path):
    """Upload a text string to Dropbox as a UTF-8 file, replacing any existing copy.

    Args:
        content_str: Text to store; encoded as UTF-8 before upload.
        filename: File name appended to ``path`` to form the Dropbox path.
        path: Destination folder path (expected to end with a separator,
            since the name is concatenated directly).

    Returns:
        The shared-link URL of the uploaded file, or the string
        ``"Error uploading file to Dropbox."`` when anything fails
        (the underlying error is printed).
    """
    try:
        dbxTeam = ADR_Access_DropboxTeam('admin')
        full_path = path + filename
        # Encode directly; wrapping the bytes in BytesIO only to call
        # getvalue() again was a no-op round trip.
        payload = content_str.encode("utf-8")
        # Delete first so the upload does not conflict with an existing file.
        if check_if_file_exists(dbxTeam, full_path):
            dbxTeam.files_delete(full_path)
        dbxTeam.files_upload(payload, full_path)
        try:
            shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(full_path)
        except Exception:
            # Presumably raised when a link already exists for this path;
            # fall back to the older call to obtain a link anyway.
            # Narrowed from a bare ``except:`` so Ctrl-C / SystemExit propagate.
            shared_link_metadata = dbxTeam.sharing_create_shared_link(full_path)
        return shared_link_metadata.url
    except Exception as e:
        print("Error uploading file to Dropbox:", str(e))
        return "Error uploading file to Dropbox."
|
| 160 |
+
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
|
| 164 |
+
def uploadanyFile(doc, pdfname, path, flag=0):
    """Upload a document to Dropbox without overwriting existing files.

    If the target name is already taken, a numeric suffix is appended
    (``name(1).ext``, ``name(2).ext``, ...) until a free name is found.

    Args:
        doc: Object to upload. With ``flag`` set it must support
            ``doc.write(stream, encoding=..., xml_declaration=...)``
            (an ElementTree-style API); otherwise ``doc.write()`` must return
            the bytes to upload (presumably a PyMuPDF document -- confirm).
        pdfname: Target file name. With ``flag`` set, everything from the
            first ``.pdf`` onward is replaced by ``.xml``.
        path: Destination folder path (expected to end with a separator,
            since the name is concatenated directly).
        flag: Truthy to upload ``doc`` as XML, falsy to upload it as-is.

    Returns:
        The shared-link URL of the uploaded file, or the string
        ``'Error uploading file to Dropbox.'`` when anything fails
        (the underlying error is printed).
    """
    try:
        dbxTeam = ADR_Access_DropboxTeam('admin')

        # Serialise the payload; the XML variant also swaps the extension.
        if flag:
            pdfname = str(pdfname).split('.pdf')[0] + '.xml'
            buffer = BytesIO()
            doc.write(buffer, encoding='utf-8', xml_declaration=True)
            data_to_upload = buffer.getvalue()
        else:
            data_to_upload = doc.write()
        file_path = path + pdfname

        # Probe name, name(1), name(2), ... until an unused path is found.
        base_name, ext = os.path.splitext(pdfname)
        counter = 1
        while check_if_file_exists(dbxTeam, file_path):
            file_path = path + f"{base_name}({counter}){ext}"
            counter += 1

        dbxTeam.files_upload(data_to_upload, file_path)

        try:
            shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(file_path)
        except Exception:
            # Presumably raised when a link already exists for this path;
            # fall back to the older call to obtain a link anyway.
            # Narrowed from a bare ``except:`` so Ctrl-C / SystemExit propagate.
            shared_link_metadata = dbxTeam.sharing_create_shared_link(file_path)

        return shared_link_metadata.url

    except Exception as e:
        print('Error uploading file to Dropbox: ' + str(e))
        return 'Error uploading file to Dropbox.'
|
| 204 |
|
| 205 |
+
|
| 206 |
# Call when the dropbox is updated with new items - if not , call parquet saved version of the df of saved items
|
| 207 |
# Call when the dropbox is updated with new items - if not , call parquet saved version of the df of saved items
|
| 208 |
def DropboxItemstoDF(folder_path):
|
|
|
|
| 242 |
return df
|
| 243 |
|
| 244 |
|
| 245 |
+
def getPathtoPDF_File(nameofPDF,progress_callback=None):
|
| 246 |
+
parquetDf = load_parquet_df()
|
| 247 |
nameofPDF=nameofPDF.replace('"', '')
|
| 248 |
try:
|
| 249 |
+
# path=parquetDf.loc[parquetDf['name'] == nameofPDF, 'path_display'].iloc[0]
|
| 250 |
+
path = parquetDf.at[parquetDf.index[parquetDf['name'] == nameofPDF][0], 'path_display']
|
| 251 |
+
if progress_callback:
|
| 252 |
+
progress_callback(60)
|
| 253 |
link=getSharedLink(path)
|
| 254 |
print(path,link)
|
| 255 |
except:
|
|
|
|
| 264 |
data = res.content
|
| 265 |
return data
|
| 266 |
|
| 267 |
+
def retrieveProjects(projname, progress_callback=None):
    """Find the documents stored in a project's "01 Project Details" folder.

    The first four characters of the cleaned project name are taken as the
    project code. A row of the cached parquet DataFrame is relevant when the
    folder part of its ``path_display`` contains both ``/<code>`` and
    ``01 project details`` (compared case-insensitively).

    Args:
        projname: Project name; surrounding whitespace, quotes and spaces are
            stripped before the four-character code is extracted.
        progress_callback: Optional callable taking an integer percentage.
            It is invoked with 20, 40, 50, 60, 70 and 80 as the work proceeds.

    Returns:
        A tuple ``(documentsToMeasure, RelevantDocuments, extracted_path)``:
        ``RelevantDocuments`` is a list of ``[name, path_display]`` pairs,
        ``documentsToMeasure`` is the subset whose name ends with ``.pdf``,
        and ``extracted_path`` is the first relevant path truncated just after
        its "01 Project Details" folder, or ``None`` if nothing matched.
    """
    details_folder = '01 project details'

    def report(percent):
        # The callback is optional. The very first update used to be called
        # unguarded, raising TypeError whenever the default None was used.
        if progress_callback:
            progress_callback(percent)

    report(20)
    projnameNospaces = projname.strip().replace('"', '').replace("'", '').replace(" ", "")
    print(projname, projnameNospaces)
    projname = '/' + projnameNospaces[:4]  # Extract main project name
    print('projname', projname)
    # Paths are lower-cased before matching, so the needle has to be as well;
    # otherwise a code containing upper-case letters could never match.
    needle = projname.lower()

    parquetDf = load_parquet_df()
    report(40)

    # Work on a separate lower-cased Series instead of adding/dropping helper
    # columns: the DataFrame is a shared cached object and must not be mutated.
    lower_paths = parquetDf['path_display'].str.lower()
    report(50)

    def path_matches(lowered_path):
        # Only the folder part counts, so a file that merely has the project
        # code or folder name inside its own file name is not selected.
        folder_path = os.path.dirname(lowered_path)
        return needle in folder_path and details_folder in folder_path

    mask = lower_paths.apply(path_matches)
    report(60)

    # Report the original (case-preserved) paths, not the lower-cased ones.
    RelevantDocuments = parquetDf[mask][['name', 'path_display']].values.tolist()
    documentsToMeasure = [doc for doc in RelevantDocuments if doc[0].endswith('.pdf')]  # Keep only PDFs
    print('documentsToMeasure',documentsToMeasure)
    report(70)

    if RelevantDocuments:
        first_path = RelevantDocuments[0][1]
        # The folder was matched case-insensitively, so locate it the same way.
        # Splitting on the exact-case literal fails to split when the folder
        # is spelled e.g. "01 PROJECT DETAILS" and yields a bogus path.
        found = re.search(re.escape(details_folder), first_path, re.IGNORECASE)
        if found:
            extracted_path = first_path[:found.end()]
        else:
            # Defensive fallback: keep the previous behaviour.
            extracted_path = first_path.split("01 Project Details")[0] + "01 Project Details"
    else:
        extracted_path = None  # Handle case when no match is found

    report(80)
    return documentsToMeasure, RelevantDocuments, extracted_path
|
| 326 |
+
|