Spaces:
Sleeping
Sleeping
| # -*- coding: utf-8 -*- | |
| """TSADropboxRetrieval.ipynb | |
| Automatically generated by Colaboratory. | |
| Original file is located at | |
| https://colab.research.google.com/drive/1d-UI3Y-z7Dj-vqu69CxluOUnN4rvsUuE | |
| """ | |
| # !pip install dropbox -q | |
| # pip install pymupdf #==1.22.5 | |
import base64
import io
import json
import os
import pathlib
import re
from functools import lru_cache
from io import BytesIO

import dropbox
import fitz
import pandas as pd
import pyarrow
import requests
from dropbox.exceptions import AuthError
def load_parquet_df():
    """Return the DataFrame produced by ``GetParquetDF``.

    Thin wrapper: it takes no arguments and simply forwards the call.
    """
    parquet_df = GetParquetDF()
    return parquet_df
# Module-level list. The functions visible in this file work on their own
# ``files_list`` argument/local rather than on this one.
files_list = []

# NOTE(review): these credentials are committed in source. Each value can now
# be overridden through an environment variable; the literals remain only as
# backward-compatible fallbacks and should be rotated and removed.
app_key = os.environ.get('DROPBOX_APP_KEY', '9bljerefjumct38')
app_secret = os.environ.get('DROPBOX_APP_SECRET', 'nl6k66clw1j1k12')
# Not referenced by the functions visible in this file.
# Presumably a short-lived access token - TODO confirm it is still needed.
access_code = os.environ.get(
    'DROPBOX_ACCESS_CODE',
    'sl.Bou05Rb15xPy851-I1UV8oOabHPY21AEPl5nrYl-Q0ninFSy0kTuRWPSve_JPbd3Z03E7eBY4r9R454rdzDM0AxLkyqrQEDzyAGUwP7kZ7s2CR6EwvdLD2a7Xh8nFEs38voLTH2IHzrQ2QEx7rji4OJ8aSQStKtJkI7_dh8tYHj5',
)
refresh_token = os.environ.get(
    'DROPBOX_REFRESH_TOKEN',
    'qK2VqvbxWMMAAAAAAAAAAXFQvrHM4xUwWUcZ6l5vGOygn1iAA6zlDjmAQNBbZprL',
)
# Base64 of "app_key:app_secret" as bytes.
basic_auth = base64.b64encode(f'{app_key}:{app_secret}'.encode())
def ADR_Access_DropboxTeam(flag):
    """Build a Dropbox team client rooted at the account's root namespace.

    Args:
        flag: ``'user'`` to act through ``as_user`` or ``'admin'`` to act
            through ``as_admin`` for the hard-coded team member id.

    Returns:
        The client returned by ``with_path_root`` for the root namespace of
        the current account.

    Raises:
        ValueError: If ``flag`` is neither ``'user'`` nor ``'admin'``.
            Previously this case failed later with an unbound local variable.
    """
    if flag not in ('user', 'admin'):
        raise ValueError(
            "flag must be 'user' or 'admin', got {!r}".format(flag)
        )

    member_id = 'dbmid:AACjJg2GKc3tI42iOnD01dd6s0XDyyx6Thw'
    team = dropbox.DropboxTeam(
        app_key=app_key,
        app_secret=app_secret,
        oauth2_refresh_token=refresh_token,
    )
    if flag == 'user':
        dbxTeam = team.as_user(member_id)
    else:
        dbxTeam = team.as_admin(member_id)

    # Re-root the client so that paths are resolved from the root namespace.
    root_namespace_id = dbxTeam.users_get_current_account().root_info.root_namespace_id
    dbxTeam = dbxTeam.with_path_root(dropbox.common.PathRoot.root(root_namespace_id))
    return dbxTeam
def getSharedLink(path):
    """Return a shared-link URL for ``path``.

    Connects as ``'user'`` and first calls
    ``sharing_create_shared_link_with_settings``. If that raises a Dropbox
    ``ApiError``, it falls back to ``sharing_create_shared_link``.

    Args:
        path: Dropbox path of the item to share.

    Returns:
        The ``url`` attribute of the shared-link metadata.
    """
    dbxTeam = ADR_Access_DropboxTeam('user')
    try:
        shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(path)
    except dropbox.exceptions.ApiError:
        # Only API-level failures trigger the fallback, as in GetParquetDF;
        # other errors propagate to the caller instead of being masked.
        shared_link_metadata = dbxTeam.sharing_create_shared_link(path)
    return shared_link_metadata.url
def handle_entries(entries, files_list):
    """Append file metadata from ``entries`` to ``files_list`` and tabulate it.

    Only entries that are ``dropbox.files.FileMetadata`` instances are
    recorded; everything else is skipped.

    Args:
        entries: Iterable of Dropbox listing entries.
        files_list: List of metadata dicts; it is extended in place.

    Returns:
        A DataFrame built from the whole of ``files_list`` (records that were
        already in it plus the newly appended ones).
    """
    files_list.extend(
        {
            'name': entry.name,
            'path_display': entry.path_display,
            'client_modified': entry.client_modified,
            'server_modified': entry.server_modified,
        }
        for entry in entries
        if isinstance(entry, dropbox.files.FileMetadata)
    )
    return pd.DataFrame.from_records(files_list)
def dropbox_connect():
    """Create a Dropbox client from the credentials hard-coded below.

    These credentials are different from the module-level ``app_key`` /
    ``app_secret`` / ``refresh_token`` values.

    Returns:
        The ``dropbox.Dropbox`` client, or ``None`` when an ``AuthError`` was
        raised while creating it. Previously that case failed with an unbound
        local variable at the ``return``.
    """
    print('connecy')
    dbxMe = None
    try:
        # NOTE(review): secrets committed in source - rotate and externalise.
        dbxMe = dropbox.Dropbox(
            app_key='67w6ibpa9d2b60x',
            app_secret='d3ecz8g1604fu04',
            oauth2_refresh_token='R_LACBBNhysAAAAAAAAAAXt9mMy9OYIV_v4pF45lG6Z8DHNV66rq1q7acWjj_H5g',
        )
    except AuthError as e:
        print('Error connecting to Dropbox with access token: ' + str(e))
    return dbxMe
def dropbox_upload_file(df, flag=0):
    """Serialise ``df`` to Parquet and store it at the fixed index path.

    The upload uses overwrite mode, so it works whether or not the file is
    already present. The previous delete-then-upload sequence raised when the
    file was missing and therefore never uploaded it.

    Args:
        df: DataFrame to serialise with ``to_parquet()``.
        flag: Unused; kept for backward compatibility.

    Returns:
        The admin team client, or ``None`` if the connection itself failed.
        Errors are printed, not raised.
    """
    dbxTeam = None
    try:
        dbxTeam = ADR_Access_DropboxTeam('admin')
        path = '/TSA JOBS/ADR Test/DropboxDirectory/df.parquet.gzip'
        # With no target path, to_parquet() returns the serialised bytes.
        payload = df.to_parquet()
        dbxTeam.files_upload(payload, path, mode=dropbox.files.WriteMode.overwrite)
    except Exception as e:
        print('Error uploading file to Dropbox: ' + str(e))
    return dbxTeam
def check_if_file_exists(dbxTeam, path):
    """Report whether metadata can be fetched for ``path``.

    Args:
        dbxTeam: Client exposing ``files_get_metadata``.
        path: Path to probe.

    Returns:
        ``True`` when ``files_get_metadata(path)`` succeeds, ``False`` when it
        raises any ``Exception`` (so every failure is reported as "missing").
    """
    try:
        dbxTeam.files_get_metadata(path)
    except Exception:
        return False
    return True
def uploadmarkupPDFTable(doc, pdfname, path):
    """Upload ``doc`` to ``path + pdfname`` and return a shared-link URL.

    An existing file at the target path is deleted before the upload.

    Args:
        doc: Object whose ``read()`` result is the content to upload.
        pdfname: File name appended to ``path``.
        path: Destination folder path; it is concatenated with ``pdfname``
            as-is, so it must already end with a separator.

    Returns:
        The shared-link URL, or ``None`` if the upload or link creation
        failed (the error is printed).
    """
    dbxTeam = ADR_Access_DropboxTeam('admin')
    try:
        path = path + pdfname
        if check_if_file_exists(dbxTeam, path):
            print("if gowa el else <3")
            dbxTeam.files_delete(path)
        print("abl el meta <3")
        dbxTeam.files_upload(doc.read(), path)
        try:
            shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(path)
        except dropbox.exceptions.ApiError:
            # Only API-level failures trigger the fallback, as in
            # GetParquetDF; anything else reaches the outer handler.
            shared_link_metadata = dbxTeam.sharing_create_shared_link(path)
        return shared_link_metadata.url
    except Exception as e:
        print('Error uploading file to Dropbox: ' + str(e))
        return None
def uploadanyFile(doc, pdfname, path, flag=0):
    """Upload ``doc`` to Dropbox and return a shared-link URL.

    Args:
        doc: When ``flag`` is truthy, an object with
            ``write(file, encoding=..., xml_declaration=...)`` (presumably an
            XML tree - confirm against callers). Otherwise an object whose
            ``write()`` call returns the content to upload.
        pdfname: File name appended to ``path``. When ``flag`` is truthy, the
            part before the first ``'.pdf'`` is kept and ``'.xml'`` is added.
        path: Destination folder path; it is concatenated with the file name
            as-is.
        flag: Truthy selects the XML branch.

    Returns:
        The shared-link URL, or the string
        ``'Error uploading file to Dropbox.'`` on any failure (the error is
        also printed).
    """
    try:
        dbxTeam = ADR_Access_DropboxTeam('admin')
        print('ppp')
        if flag:
            pdfname = str(pdfname).split('.pdf')[0] + '.xml'
            path = path + pdfname
            print(path)
            buffer = BytesIO()
            doc.write(buffer, encoding='utf-8', xml_declaration=True)
            payload = buffer.getvalue()
        else:
            path = path + pdfname
            payload = doc.write()

        # The payload is built before anything is deleted, so a serialisation
        # failure can no longer remove the existing file.
        if check_if_file_exists(dbxTeam, path):
            dbxTeam.files_delete(path)
        dbxTeam.files_upload(payload, path)

        try:
            shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(path)
        except dropbox.exceptions.ApiError:
            # Only API-level failures trigger the fallback, as in
            # GetParquetDF; anything else reaches the outer handler.
            shared_link_metadata = dbxTeam.sharing_create_shared_link(path)
        return shared_link_metadata.url
    except Exception as e:
        print('Error uploading file to Dropbox: ' + str(e))
        return 'Error uploading file to Dropbox.'
# Call this when Dropbox has been updated with new items; otherwise, load the saved Parquet version of the items DataFrame.
def DropboxItemstoDF(folder_path):
    """List ``folder_path`` recursively and tabulate the file metadata.

    Every page of the listing is processed, including the first one.
    Previously the first page was dropped, and a listing that fit on a single
    page failed with an unbound local variable.

    Args:
        folder_path: Dropbox folder to list with ``recursive=True``.

    Returns:
        A ``(df, files_list)`` tuple: the DataFrame built by
        ``handle_entries`` and the list of metadata dicts behind it.
    """
    files_list = []
    dbxTeam = ADR_Access_DropboxTeam('user')
    res = dbxTeam.files_list_folder(path=folder_path, recursive=True)
    df = handle_entries(res.entries, files_list)
    # Follow the cursor until the listing is exhausted.
    while res.has_more:
        res = dbxTeam.files_list_folder_continue(cursor=res.cursor)
        df = handle_entries(res.entries, files_list)
    return df, files_list
def GetParquetDF():
    """Download the saved Parquet index from Dropbox as a DataFrame.

    Connects as ``'user'``, obtains a shared link for the fixed Parquet path
    (falling back to ``sharing_create_shared_link`` on ``ApiError``),
    downloads the file through that link and parses it with pandas.

    Returns:
        The DataFrame read from the downloaded Parquet bytes.
    """
    parquet_path = '/TSA JOBS/ADR Test/DropboxDirectory/df.parquet.gzip'
    client = ADR_Access_DropboxTeam('user')

    try:
        link = client.sharing_create_shared_link_with_settings(path=parquet_path)
    except dropbox.exceptions.ApiError:
        link = client.sharing_create_shared_link(path=parquet_path)

    # The call returns a (metadata, response) pair; only the response is used.
    _, response = client.sharing_get_shared_link_file(url=link.url)
    raw_bytes = response.content

    with io.BytesIO(raw_bytes) as buffer:
        frame = pd.read_parquet(buffer)
    return frame
def getPathtoPDF_File(nameofPDF, progress_callback=None):
    """Look up a file by name in the Parquet index and create a shared link.

    Args:
        nameofPDF: File name to look up; double quotes are removed first.
        progress_callback: Optional callable; called with ``60`` once the
            path has been found.

    Returns:
        ``(path, link)`` on success, or the string
        ``'Project does not exist'`` when the lookup or link creation fails.
    """
    parquetDf = load_parquet_df()
    nameofPDF = nameofPDF.replace('"', '')
    try:
        matches = parquetDf.loc[parquetDf['name'] == nameofPDF, 'path_display']
        # iloc[0] raises IndexError when no row matches; handled below.
        path = matches.iloc[0]
        if progress_callback:
            progress_callback(60)
        link = getSharedLink(path)
        print(path, link)
    except Exception:
        # Same catch-all result as before, but KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return 'Project does not exist'
    return path, link
| # parquetDf | |
| # getPathtoPDF_File('A5157-EBLA-V5-XX-SH-L-0004-D2-01.pdf') | |
def getPDFData(path):
    """Download the file at ``path`` and return its raw content.

    Args:
        path: Dropbox path passed to ``files_download``.

    Returns:
        The ``content`` attribute of the download response.
    """
    client = ADR_Access_DropboxTeam('admin')
    # files_download returns (metadata, response); the metadata is not needed.
    _metadata, response = client.files_download(path)
    return response.content
def retrieveProjects(projname, progress_callback=None):
    """Find the documents under a project's "01 Project Details" folder.

    The project key is the first space-separated token of ``projname`` with
    surrounding double quotes removed. A row of the Parquet index matches
    when its lower-cased ``path_display`` contains both ``'/' + key`` and
    ``'01 project details'``.

    Args:
        projname: Project name, optionally wrapped in double quotes.
        progress_callback: Optional callable. When given, it is called with
            20, 40, 50, 60, 70 and 80 as the work progresses.

    Returns:
        A ``(documentsToMeasure, RelevantDocuments, extracted_path)`` tuple:
        ``RelevantDocuments`` is a list of ``[name, path_display]`` pairs,
        ``documentsToMeasure`` is the subset whose name ends with ``'.pdf'``,
        and ``extracted_path`` is the first matching path cut just after its
        "01 Project Details" segment (``None`` when nothing matched).
    """
    def _report(percent):
        # The callback is optional, so every progress update is guarded.
        if progress_callback:
            progress_callback(percent)

    _report(20)
    # Keep only the first token and drop surrounding quotes.
    projname = projname.split(' ')[0].strip('"')
    print('projname', projname)
    parquetDf = load_parquet_df()
    _report(40)

    # Lower-case copy for matching; the loaded frame itself is not modified.
    lower_paths = parquetDf['path_display'].str.lower()
    _report(50)

    marker = '01 project details'
    if projname:
        # The search term is lower-cased too, so the match is case-insensitive.
        needle = '/' + projname.lower()
        mask = (
            lower_paths.str.contains(needle, regex=False, na=False)
            & lower_paths.str.contains(marker, regex=False, na=False)
        )
    else:
        # An empty key would match every path; treat it as "no match".
        mask = pd.Series(False, index=parquetDf.index)
    _report(60)

    RelevantDocuments = parquetDf.loc[mask, ['name', 'path_display']].values.tolist()
    documentsToMeasure = [doc for doc in RelevantDocuments if doc[0].endswith('.pdf')]
    _report(70)

    extracted_path = None
    if RelevantDocuments:
        first_path = RelevantDocuments[0][1]
        # Find the folder segment case-insensitively, like the filter above,
        # and keep the path's own spelling of it.
        found = re.search(re.escape(marker), first_path, re.IGNORECASE)
        if found:
            extracted_path = first_path[:found.end()]
        else:
            extracted_path = first_path.split("01 Project Details")[0] + "01 Project Details"
    _report(80)
    return documentsToMeasure, RelevantDocuments, extracted_path