# -*- coding: utf-8 -*-
"""TSADropboxRetrieval.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1d-UI3Y-z7Dj-vqu69CxluOUnN4rvsUuE

Helpers for listing, downloading and uploading files in the team Dropbox,
plus lookups against a parquet snapshot of the Dropbox file index.
"""

# !pip install dropbox -q
# pip install pymupdf #==1.22.5

import base64
import io
import json
import pathlib
import re
from functools import lru_cache
from io import BytesIO

import dropbox
import fitz
import pandas as pd
import pyarrow
import requests
from dropbox.exceptions import AuthError


@lru_cache(maxsize=1)
def load_parquet_df():
    """Return the parquet file index, downloading it only on the first call.

    The same DataFrame object is handed to every caller, so callers must not
    modify it in place.
    """
    return GetParquetDF()


files_list = []

# SECURITY NOTE(review): these credentials are committed in source. They are
# kept verbatim so the module keeps working, but they should be rotated and
# loaded from a secret store or environment variables instead.
app_key = '9bljerefjumct38'
app_secret = 'nl6k66clw1j1k12'
access_code = 'sl.Bou05Rb15xPy851-I1UV8oOabHPY21AEPl5nrYl-Q0ninFSy0kTuRWPSve_JPbd3Z03E7eBY4r9R454rdzDM0AxLkyqrQEDzyAGUwP7kZ7s2CR6EwvdLD2a7Xh8nFEs38voLTH2IHzrQ2QEx7rji4OJ8aSQStKtJkI7_dh8tYHj5'
refresh_token = 'qK2VqvbxWMMAAAAAAAAAAXFQvrHM4xUwWUcZ6l5vGOygn1iAA6zlDjmAQNBbZprL'
basic_auth = base64.b64encode(f'{app_key}:{app_secret}'.encode())

# Team member id the team client acts as (both in user and admin mode).
_TEAM_MEMBER_ID = 'dbmid:AACjJg2GKc3tI42iOnD01dd6s0XDyyx6Thw'
# Dropbox path of the parquet snapshot of the file index.
_PARQUET_INDEX_PATH = '/TSA JOBS/ADR Test/DropboxDirectory/df.parquet.gzip'
# Folder name that marks the project-details subtree of a project.
_PROJECT_DETAILS_FOLDER = '01 Project Details'


def ADR_Access_DropboxTeam(flag):
    """Return a team Dropbox client rooted at the team's root namespace.

    Args:
        flag: 'user' to act as the team member, 'admin' to act as admin.

    Returns:
        A Dropbox client whose path root is the account's root namespace.

    Raises:
        ValueError: if ``flag`` is neither 'user' nor 'admin'.
    """
    if flag not in ('user', 'admin'):
        raise ValueError(f"flag must be 'user' or 'admin', got {flag!r}")

    team = dropbox.DropboxTeam(
        app_key=app_key,
        app_secret=app_secret,
        oauth2_refresh_token=refresh_token,
    )
    if flag == 'user':
        dbxTeam = team.as_user(_TEAM_MEMBER_ID)
    else:
        dbxTeam = team.as_admin(_TEAM_MEMBER_ID)

    root_namespace_id = dbxTeam.users_get_current_account().root_info.root_namespace_id
    return dbxTeam.with_path_root(dropbox.common.PathRoot.root(root_namespace_id))


def _create_shared_link(dbx, path):
    """Return shared-link metadata for ``path``, with a fallback call.

    Tries ``sharing_create_shared_link_with_settings`` first and falls back
    to ``sharing_create_shared_link`` when the API rejects the request
    (presumably because a link already exists -- confirm against the
    Dropbox API documentation).
    """
    try:
        return dbx.sharing_create_shared_link_with_settings(path)
    except dropbox.exceptions.ApiError:
        return dbx.sharing_create_shared_link(path)


def getSharedLink(path):
    """Return the URL of a shared link for the Dropbox file at ``path``."""
    dbxTeam = ADR_Access_DropboxTeam('user')
    return _create_shared_link(dbxTeam, path).url


def _append_file_metadata(entries, files_list):
    """Append one metadata dict per file entry in ``entries`` to ``files_list``."""
    files_list.extend(
        {
            'name': entry.name,
            'path_display': entry.path_display,
            'client_modified': entry.client_modified,
            'server_modified': entry.server_modified,
        }
        for entry in entries
        # Folders and deleted entries carry no file timestamps; skip them.
        if isinstance(entry, dropbox.files.FileMetadata)
    )


def handle_entries(entries, files_list):
    """Collect file metadata from ``entries`` and return it as a DataFrame.

    Args:
        entries: iterable of Dropbox metadata entries (one listing page).
        files_list: list that accumulates the metadata dicts; it is mutated
            in place so it can be shared across pages.

    Returns:
        A DataFrame built from everything accumulated in ``files_list``.
    """
    _append_file_metadata(entries, files_list)
    return pd.DataFrame.from_records(files_list)


def dropbox_connect():
    """Create a Dropbox client from the credentials embedded in this function.

    These are a different app key/secret than the team client uses.

    Returns:
        The Dropbox client, or ``None`` if an ``AuthError`` was raised.
    """
    print('connecy')
    dbxMe = None
    try:
        # SECURITY NOTE(review): credentials committed in source -- rotate
        # and externalise.
        dbxMe = dropbox.Dropbox(
            app_key='67w6ibpa9d2b60x',
            app_secret='d3ecz8g1604fu04',
            oauth2_refresh_token='R_LACBBNhysAAAAAAAAAAXt9mMy9OYIV_v4pF45lG6Z8DHNV66rq1q7acWjj_H5g',
        )
    except AuthError as e:
        print('Error connecting to Dropbox with access token: ' + str(e))
    return dbxMe


def check_if_file_exists(dbxTeam, path):
    """Return True if metadata for ``path`` can be fetched, else False.

    Any exception from the metadata call (including connectivity problems)
    is reported as "does not exist".
    """
    try:
        dbxTeam.files_get_metadata(path)
    except Exception:
        return False
    return True


def _replace_file(dbx, data, path):
    """Upload ``data`` to ``path``, deleting an existing file there first."""
    if check_if_file_exists(dbx, path):
        dbx.files_delete(path)
    return dbx.files_upload(data, path)


def dropbox_upload_file(df, flag=0):
    """Upload ``df`` as the parquet file index, replacing any previous one.

    Errors are printed, not raised. The cached result of
    ``load_parquet_df`` is not invalidated by this call.

    Args:
        df: DataFrame to serialise with ``DataFrame.to_parquet``.
        flag: unused; kept for backward compatibility.

    Returns:
        The admin Dropbox client, or ``None`` if connecting failed.
    """
    dbxTeam = None
    try:
        dbxTeam = ADR_Access_DropboxTeam('admin')
        # Serialise before touching the remote file so a serialisation
        # failure cannot leave the index deleted.
        payload = df.to_parquet()
        _replace_file(dbxTeam, payload, _PARQUET_INDEX_PATH)
    except Exception as e:
        print('Error uploading file to Dropbox: ' + str(e))
    return dbxTeam


def uploadmarkupPDFTable(doc, pdfname, path):
    """Upload a file-like ``doc`` to ``path + pdfname`` and return its link.

    Args:
        doc: object with a ``read()`` method returning the bytes to upload.
        pdfname: file name appended to ``path``.
        path: destination folder path (expected to end with a separator).

    Returns:
        The shared-link URL, or ``None`` if anything failed (the error is
        printed).
    """
    dbxTeam = ADR_Access_DropboxTeam('admin')
    try:
        path = path + pdfname
        # Read the payload first so a read failure cannot delete the
        # existing file without replacing it.
        payload = doc.read()
        if check_if_file_exists(dbxTeam, path):
            print("if gowa el else <3")
            dbxTeam.files_delete(path)
        print("abl el meta <3")
        dbxTeam.files_upload(payload, path)
        return _create_shared_link(dbxTeam, path).url
    except Exception as e:
        print('Error uploading file to Dropbox: ' + str(e))
    return None


def uploadanyFile(doc, pdfname, path, flag=0):
    """Upload ``doc`` next to ``path`` and return a shared link to it.

    Args:
        doc: when ``flag`` is truthy, an object whose
            ``write(file, encoding=..., xml_declaration=...)`` serialises
            XML; otherwise an object whose ``write()`` returns the bytes to
            upload.
        pdfname: target file name; with ``flag`` set, everything from the
            first '.pdf' onwards is replaced by '.xml'.
        path: destination folder path (expected to end with a separator).
        flag: truthy to upload XML, falsy to upload ``doc.write()`` as is.

    Returns:
        The shared-link URL, or the string
        'Error uploading file to Dropbox.' on failure.
    """
    try:
        dbxTeam = ADR_Access_DropboxTeam('admin')
        print('ppp')
        if flag:
            pdfname = str(pdfname).split('.pdf')[0] + '.xml'
            path = path + pdfname
            print(path)
            buffer = BytesIO()
            doc.write(buffer, encoding='utf-8', xml_declaration=True)
            payload = buffer.getvalue()
        else:
            path = path + pdfname
            payload = doc.write()
        # The payload is fully serialised before the old file is removed.
        _replace_file(dbxTeam, payload, path)
        return _create_shared_link(dbxTeam, path).url
    except Exception as e:
        print('Error uploading file to Dropbox: ' + str(e))
        return 'Error uploading file to Dropbox.'


def DropboxItemstoDF(folder_path):
    """List every file under ``folder_path`` recursively.

    Call this when the Dropbox has been updated with new items; otherwise
    use the saved parquet version of the index (``load_parquet_df``).

    Args:
        folder_path: Dropbox folder to walk, e.g. "/TSA JOBS".

    Returns:
        A ``(DataFrame, files_list)`` tuple holding the same file metadata
        as a DataFrame and as a list of dicts.
    """
    files_list = []
    dbxTeam = ADR_Access_DropboxTeam('user')

    res = dbxTeam.files_list_folder(path=folder_path, recursive=True)
    # The first page must be consumed too; only later pages come from
    # files_list_folder_continue.
    _append_file_metadata(res.entries, files_list)
    while res.has_more:
        res = dbxTeam.files_list_folder_continue(cursor=res.cursor)
        _append_file_metadata(res.entries, files_list)

    return pd.DataFrame.from_records(files_list), files_list


def GetParquetDF():
    """Download the parquet file index from Dropbox and return it as a DataFrame."""
    dbxTeam = ADR_Access_DropboxTeam('user')
    shared_link_metadata = _create_shared_link(dbxTeam, _PARQUET_INDEX_PATH)

    # Fetch the file content through the shared link.
    _, res = dbxTeam.sharing_get_shared_link_file(url=shared_link_metadata.url)
    with io.BytesIO(res.content) as pq_file:
        return pd.read_parquet(pq_file)


def getPathtoPDF_File(nameofPDF, progress_callback=None):
    """Look up a file by name in the index and return its path and link.

    Example:
        getPathtoPDF_File('A5157-EBLA-V5-XX-SH-L-0004-D2-01.pdf')

    Args:
        nameofPDF: file name to look up; double quotes are stripped.
        progress_callback: optional callable invoked with 60 once the path
            has been found.

    Returns:
        ``(path, link)`` on success, or the string
        'Project does not exist' if the lookup or link creation fails.
    """
    parquetDf = load_parquet_df()
    nameofPDF = nameofPDF.replace('"', '')
    try:
        # .iloc[0] raises IndexError when no row matches.
        path = parquetDf.loc[parquetDf['name'] == nameofPDF, 'path_display'].iloc[0]
        if progress_callback:
            progress_callback(60)
        link = getSharedLink(path)
        print(path, link)
    except Exception:
        return 'Project does not exist'
    return path, link


def getPDFData(path):
    """Download the Dropbox file at ``path`` and return its raw bytes."""
    dbxTeam = ADR_Access_DropboxTeam('admin')
    _, res = dbxTeam.files_download(path)
    return res.content


def retrieveProjects(projname, progress_callback=None):
    """Find the documents in a project's "01 Project Details" folder.

    Matching is case-insensitive. The shared cached index is not modified.

    Args:
        projname: project name; only the text before the first space is
            used, and double quotes and leading slashes are ignored.
        progress_callback: optional callable invoked with 40, 50, 60, 70
            and 80 as the lookup progresses.

    Returns:
        ``(documentsToMeasure, RelevantDocuments, extracted_path)`` where
        ``RelevantDocuments`` is a list of ``[name, path_display]`` pairs,
        ``documentsToMeasure`` is the subset whose name ends with '.pdf',
        and ``extracted_path`` is the first match's path up to and
        including the project-details folder (``None`` if nothing matched).
    """
    def report(percent):
        if progress_callback:
            progress_callback(percent)

    # Main project name: first token, without quotes or leading slashes.
    token = projname.split(' ')[0].replace('"', '').lstrip('/')
    print('projname', token)

    parquetDf = load_parquet_df()
    report(40)

    # Work on a derived Series: the DataFrame is shared through the
    # lru_cache, so adding temporary columns to it would leak state.
    paths_lower = parquetDf['path_display'].str.lower()
    report(50)

    if token:
        # Both needles are lower-cased to match the lower-cased paths.
        in_project = paths_lower.str.contains(
            '/' + token.lower(), regex=False, na=False)
        in_details = paths_lower.str.contains(
            _PROJECT_DETAILS_FOLDER.lower(), regex=False, na=False)
        mask = in_project & in_details
    else:
        # An empty project name must not match every project.
        mask = pd.Series(False, index=parquetDf.index)
    report(60)

    # Original (case-preserving) paths of the matching rows.
    RelevantDocuments = parquetDf.loc[mask, ['name', 'path_display']].values.tolist()
    documentsToMeasure = [doc for doc in RelevantDocuments if doc[0].endswith('.pdf')]
    report(70)

    extracted_path = None
    if RelevantDocuments:
        first_path = RelevantDocuments[0][1]
        # Locate the folder case-insensitively but slice the original
        # string so the returned path keeps its real casing.
        found = re.search(
            re.escape(_PROJECT_DETAILS_FOLDER), first_path, re.IGNORECASE)
        if found:
            extracted_path = first_path[:found.end()]
        else:
            extracted_path = (
                first_path.split(_PROJECT_DETAILS_FOLDER)[0]
                + _PROJECT_DETAILS_FOLDER
            )
    report(80)

    return documentsToMeasure, RelevantDocuments, extracted_path