Marthee committed on
Commit
43ed14c
·
verified ·
1 Parent(s): 667daa3

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +46 -0
  2. pdftotext.py +11 -0
  3. requirements.txt +7 -0
  4. tsadropboxretrieval.py +228 -0
app.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify, abort
2
+ import tsadropboxretrieval
3
+ import pdftotext
4
+
5
+ app = Flask(__name__)
6
+ # API_KEY = "adrpdftotext" # Replace with your actual API key
7
+
8
+ # def check_api_key():
9
+ # api_key = request.headers.get("x-api-key")
10
+ # if api_key != API_KEY:
11
+ # abort(403) # Forbidden if API key is missing or incorrect
12
+
13
@app.route('/process', methods=['POST'])
def process():
    """Download a PDF from Dropbox and return its extracted text as JSON.

    The request body must be a JSON object with a 'filePath' key holding the
    Dropbox path of the PDF.

    Returns:
        200 with {"message": ..., "input_data": <extracted text>} on success,
        400 when the body is not a JSON object or lacks 'filePath',
        500 with the exception text when download or extraction fails.
    """
    # NOTE(review): the API-key check is commented out in this file, so the
    # endpoint is currently unauthenticated.
    # check_api_key()
    try:
        print('In process')
        # silent=True returns None for a missing or malformed JSON body
        # instead of raising, so bad input is reported as 400 rather than
        # falling into the generic 500 handler below.
        data = request.get_json(silent=True)
        print(data)

        # The key the code reads is 'filePath'; the error message names it.
        if not isinstance(data, dict) or 'filePath' not in data:
            return jsonify({"error": "Missing 'filePath' in request data"}), 400

        pdfpath = data['filePath']

        dbxTeam = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
        md, res = dbxTeam.files_download(path=pdfpath)
        pdf_data = res.content

        pdftext = pdftotext.texts_from_pdf(pdf_data)

        response_data = {
            "message": "Data received",
            "input_data": pdftext
        }
        return jsonify(response_data)

    except Exception as e:
        print(f"Error: {e}")
        return jsonify({"error": str(e)}), 500
44
+
45
+ if __name__ == '__main__':
46
+ app.run(host='0.0.0.0', port=7860)
pdftotext.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fitz
2
+
3
def texts_from_pdf(input_pdf_data):
    """Return the plain text of every page of a PDF, in page order.

    Args:
        input_pdf_data: Raw bytes of the PDF document.

    Returns:
        One string containing the text of all pages concatenated.
    """
    # The previous loop overwrote the result on every page, so only a single
    # page's text was ever returned; join all pages instead.  The context
    # manager closes the document once the text has been collected.
    with fitz.open(stream=input_pdf_data, filetype='pdf') as pdf_document:
        text_instances = ''.join(page.get_text() for page in pdf_document)

    print(text_instances)
    return text_instances
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ datasets==2.*
2
+ flask==3.0.0.*
3
+ requests==2.27.*
4
+ sentencepiece==0.1.*
5
+ pymupdf==1.22.5
6
+ dropbox
7
+ numpy<2
tsadropboxretrieval.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ """TSADropboxRetrieval.ipynb
3
+
4
+ Automatically generated by Colaboratory.
5
+
6
+ Original file is located at
7
+ https://colab.research.google.com/drive/1d-UI3Y-z7Dj-vqu69CxluOUnN4rvsUuE
8
+ """
9
+
10
+ # !pip install dropbox -q
11
+
12
+ # pip install pymupdf #==1.22.5
13
+
14
+ import base64
15
+ import requests
16
+ import json
17
+ import pathlib
18
+ import pandas as pd
19
+ import dropbox
20
+ from dropbox.exceptions import AuthError
21
+ import fitz
22
+ import io
23
+ import re
24
+ import pyarrow
25
+ from io import BytesIO
26
+
27
+ """### NEW CODE - OCTOBER 26 - Marthe"""
28
+
29
import os

# Module-level list; the functions visible in this file use their own
# local / parameter lists rather than this one.
files_list = []

# Dropbox team-app credentials.  These were previously hard-coded in the
# source; they are now read from the environment so that secrets are not
# committed to the repository.  Set DROPBOX_APP_KEY, DROPBOX_APP_SECRET,
# DROPBOX_ACCESS_CODE and DROPBOX_REFRESH_TOKEN in the deployment (and
# rotate the values that were previously published).
app_key = os.environ.get('DROPBOX_APP_KEY', '')
app_secret = os.environ.get('DROPBOX_APP_SECRET', '')
access_code = os.environ.get('DROPBOX_ACCESS_CODE', '')
refresh_token = os.environ.get('DROPBOX_REFRESH_TOKEN', '')
# "key:secret" base64-encoded, the form used for HTTP Basic authentication.
basic_auth = base64.b64encode(f'{app_key}:{app_secret}'.encode())
36
+
37
def ADR_Access_DropboxTeam(flag):
    """Build a Dropbox team client rooted at the account's root namespace.

    Args:
        flag: 'user' to act as the team member, 'admin' to act as admin.

    Returns:
        The client returned by ``with_path_root`` for the root namespace of
        the current account.

    Raises:
        ValueError: if flag is neither 'user' nor 'admin'.
    """
    # Reject unknown flags up front; previously they fell through both
    # branches and failed later with an unbound-variable error.
    if flag not in ('user', 'admin'):
        raise ValueError(f"flag must be 'user' or 'admin', got {flag!r}")

    member_id = 'dbmid:AACjJg2GKc3tI42iOnD01dd6s0XDyyx6Thw'
    team = dropbox.DropboxTeam(app_key=app_key,
                               app_secret=app_secret,
                               oauth2_refresh_token=refresh_token)
    if flag == 'user':
        dbxTeam = team.as_user(member_id)
    else:
        dbxTeam = team.as_admin(member_id)

    # Re-root the client so paths are resolved from the root namespace.
    root_namespace_id = dbxTeam.users_get_current_account().root_info.root_namespace_id
    dbxTeam = dbxTeam.with_path_root(dropbox.common.PathRoot.root(root_namespace_id))
    return dbxTeam
49
+
50
def getSharedLink(path):
    """Return the URL of a shared link for the Dropbox item at path.

    Tries ``sharing_create_shared_link_with_settings`` first and falls back
    to ``sharing_create_shared_link`` when the API rejects that call.

    Args:
        path: Dropbox path of the item to share.

    Returns:
        The ``url`` attribute of the shared-link metadata.
    """
    dbxTeam = ADR_Access_DropboxTeam('user')
    try:
        shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(path)
    except dropbox.exceptions.ApiError:
        # Only API-level rejections trigger the fallback (presumably the
        # "link already exists" case - TODO confirm); a bare except here
        # used to hide every other failure as well.
        shared_link_metadata = dbxTeam.sharing_create_shared_link(path)
    return shared_link_metadata.url
57
+
58
def handle_entries(entries, files_list):
    """Append metadata for every file entry to files_list and tabulate it.

    Args:
        entries: Iterable of Dropbox listing entries; only instances of
            ``dropbox.files.FileMetadata`` are recorded.
        files_list: List of metadata dicts, extended in place.

    Returns:
        A DataFrame built from the full (accumulated) files_list.
    """
    files_list.extend(
        {
            'name': entry.name,
            'path_display': entry.path_display,
            'client_modified': entry.client_modified,
            'server_modified': entry.server_modified,
        }
        for entry in entries
        if isinstance(entry, dropbox.files.FileMetadata)
    )
    return pd.DataFrame.from_records(files_list)
73
+
74
def dropbox_connect():
    """Create a client for the second (non-team) Dropbox app.

    Credentials are read from the environment variables
    DROPBOX_PERSONAL_APP_KEY, DROPBOX_PERSONAL_APP_SECRET and
    DROPBOX_PERSONAL_REFRESH_TOKEN instead of being hard-coded in source.

    Returns:
        The ``dropbox.Dropbox`` client, or None when an AuthError was raised
        while creating it.
    """
    import os

    print('connecy')
    # Pre-bind the result so the return below cannot hit an unbound name
    # when the except branch runs.
    dbxMe = None
    try:
        dbxMe = dropbox.Dropbox(
            app_key=os.environ.get('DROPBOX_PERSONAL_APP_KEY'),
            app_secret=os.environ.get('DROPBOX_PERSONAL_APP_SECRET'),
            oauth2_refresh_token=os.environ.get('DROPBOX_PERSONAL_REFRESH_TOKEN'),
        )
    except AuthError as e:
        print('Error connecting to Dropbox with access token: ' + str(e))
    return dbxMe
90
+
91
+
92
def dropbox_upload_file(df, flag=0):
    """Serialize df to Parquet and store it at the fixed directory-index path.

    Args:
        df: DataFrame to store.
        flag: Unused; kept for backward compatibility with existing callers.

    Returns:
        The admin team client used for the upload, or None when the client
        could not be created.  Upload errors are printed, not raised.
    """
    # Pre-bind so the final return is valid even if client creation fails.
    dbxTeam = None
    try:
        dbxTeam = ADR_Access_DropboxTeam('admin')
        path = '/TSA JOBS/ADR Test/DropboxDirectory/df.parquet.gzip'
        doc = df.to_parquet()
        # Overwrite in place rather than delete-then-upload: deleting first
        # raised when the file did not exist yet (so the first upload never
        # happened) and lost the old file if the upload then failed.
        dbxTeam.files_upload(doc, path, mode=dropbox.files.WriteMode.overwrite)
    except Exception as e:
        print('Error uploading file to Dropbox: ' + str(e))
    return dbxTeam
103
+
104
def check_if_file_exists(dbxTeam, path):
    """Report whether metadata for path can be fetched through dbxTeam.

    Args:
        dbxTeam: Client object exposing ``files_get_metadata``.
        path: Path to probe.

    Returns:
        True when the metadata call succeeds, False when it raises any
        Exception (the reason for the failure is not inspected).
    """
    try:
        dbxTeam.files_get_metadata(path)
    except Exception:
        return False
    return True
112
+
113
def uploadanyFile(doc, pdfname, path, flag=0):
    """Upload a document to Dropbox and return a shared link to it.

    Args:
        doc: Object to upload.  With flag set it must support
            ``write(file, encoding=..., xml_declaration=...)`` (presumably
            an XML ElementTree - TODO confirm); otherwise ``doc.write()``
            must return the bytes to upload.
        pdfname: File name; with flag set, the part before '.pdf' is kept
            and '.xml' is appended.
        path: Destination folder path; the file name is appended to it
            directly, so it must already end with a separator.
        flag: Truthy to upload as XML, falsy to upload ``doc.write()``.

    Returns:
        The shared-link URL, or the string 'Error uploading file to Dropbox.'
        when any step fails (the error itself is printed).
    """
    try:
        dbxTeam = ADR_Access_DropboxTeam('admin')
        print('ppp')
        if flag:
            pdfname = str(pdfname).split('.pdf')[0] + '.xml'
            path = path + pdfname
            print(path)
            f = BytesIO()
            doc.write(f, encoding='utf-8', xml_declaration=True)
            payload = f.getvalue()
        else:
            path = path + pdfname
            payload = doc.write()

        # Serialize before touching the existing file so that a failure to
        # produce the payload cannot leave the destination deleted.
        if check_if_file_exists(dbxTeam, path):
            dbxTeam.files_delete(path)
        dbxTeam.files_upload(payload, path)

        try:
            print('hereintry')
            shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(path)
        except dropbox.exceptions.ApiError:
            # Fall back only on API-level rejection; other errors go to the
            # outer handler instead of being silently retried.
            print('hereinexcept')
            shared_link_metadata = dbxTeam.sharing_create_shared_link(path)
        return shared_link_metadata.url
    except Exception as e:
        print('Error uploading file to Dropbox: ' + str(e))

        return 'Error uploading file to Dropbox.'
144
+
145
+ # Call when the dropbox is updated with new items - if not , call parquet saved version of the df of saved items
146
+ # Call when the dropbox is updated with new items - if not , call parquet saved version of the df of saved items
147
def DropboxItemstoDF(folder_path):
    """List a Dropbox folder recursively and tabulate its files.

    Args:
        folder_path: Dropbox folder to list.

    Returns:
        A tuple ``(df, files_list)``: the DataFrame produced by
        ``handle_entries`` over every listed file, and the underlying list
        of metadata dicts.
    """
    files_list = []
    dbxTeam = ADR_Access_DropboxTeam('user')  # or pass dbx in parameters
    res = dbxTeam.files_list_folder(path=folder_path, recursive=True)

    # Process the first page too: it used to be skipped, which dropped its
    # files and left the result undefined for single-page listings.
    df2 = handle_entries(res.entries, files_list)
    while res.has_more:
        res = dbxTeam.files_list_folder_continue(cursor=res.cursor)
        df2 = handle_entries(res.entries, files_list)

    return df2, files_list
161
+
162
+
163
def GetParquetDF():
    """Load the stored Dropbox directory index as a DataFrame.

    Obtains a shared link for the fixed Parquet path (falling back to
    ``sharing_create_shared_link`` when the settings variant raises
    ApiError), downloads the file through that link and parses it.

    Returns:
        The DataFrame read from the Parquet file.
    """
    parquet_path = '/TSA JOBS/ADR Test/DropboxDirectory/df.parquet.gzip'
    client = ADR_Access_DropboxTeam('user')  # or pass dbx in parameters

    try:
        link = client.sharing_create_shared_link_with_settings(path=parquet_path)
    except dropbox.exceptions.ApiError:
        link = client.sharing_create_shared_link(path=parquet_path)

    # The first element of the returned pair (metadata) is not needed here.
    _, response = client.sharing_get_shared_link_file(url=link.url)

    with io.BytesIO(response.content) as buffer:
        frame = pd.read_parquet(buffer)
    return frame
182
+
183
+
184
def getPathtoPDF_File(nameofPDF):
    """Look up a PDF by file name in the stored index and share it.

    Args:
        nameofPDF: File name to look up; double quotes are stripped first.

    Returns:
        ``(path, link)`` for the first index row whose 'name' matches, or
        the string 'Project does not exist' when the lookup or the link
        creation fails.
    """
    parquetDf = GetParquetDF()
    nameofPDF = nameofPDF.replace('"', '')
    try:
        path = parquetDf.loc[parquetDf['name'] == nameofPDF, 'path_display'].iloc[0]
        link = getSharedLink(path)
        print(path, link)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return 'Project does not exist'
    return path, link
194
+ # parquetDf
195
+
196
+ # getPathtoPDF_File('A5157-EBLA-V5-XX-SH-L-0004-D2-01.pdf')
197
def getPDFData(path):
    """Download the file at path with the admin team client.

    Args:
        path: Dropbox path of the file.

    Returns:
        The ``content`` of the download response.
    """
    _, response = ADR_Access_DropboxTeam('admin').files_download(path)
    return response.content
202
+
203
def retrieveProjects(projname):
    """Find indexed documents whose path mentions a project's details folder.

    The project name is suffixed with ' 01 Project Details', lower-cased and
    split on punctuation/space; a document matches when every resulting
    token occurs somewhere in its lower-cased 'path_display'.

    Args:
        projname: Project name to search for.

    Returns:
        ``(documentsToMeasure, RelevantDocuments)``: both are lists of
        ``[name, path_display]`` pairs; the first holds only the entries
        whose name ends with '.pdf'.
    """
    print('retrieve')

    catalogue = GetParquetDF()
    search_phrase = f'{projname} 01 Project Details'
    tokens = set(re.split(r'[`\-= ~!@#$%^&*()_+\[\]{};\'\\:"|<,/<>?]', search_phrase.lower()))

    # Case-insensitive matching is done on a lowered copy of the column, so
    # no temporary column has to be added to (and dropped from) the frame.
    lowered_paths = catalogue['path_display'].str.lower()
    keep = lowered_paths.apply(lambda p: all(tok in p for tok in tokens))

    RelevantDocuments = catalogue[keep][['name', 'path_display']].values.tolist()
    documentsToMeasure = [entry for entry in RelevantDocuments if entry[0].endswith('.pdf')]

    print('done')
    return documentsToMeasure, RelevantDocuments
227
+
228
+