Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files- app.py +46 -0
- pdftotext.py +11 -0
- requirements.txt +7 -0
- tsadropboxretrieval.py +228 -0
app.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from flask import Flask, request, jsonify, abort
|
| 2 |
+
import tsadropboxretrieval
|
| 3 |
+
import pdftotext
|
| 4 |
+
|
| 5 |
+
app = Flask(__name__)
|
| 6 |
+
# API_KEY = "adrpdftotext" # Replace with your actual API key
|
| 7 |
+
|
| 8 |
+
# def check_api_key():
|
| 9 |
+
# api_key = request.headers.get("x-api-key")
|
| 10 |
+
# if api_key != API_KEY:
|
| 11 |
+
# abort(403) # Forbidden if API key is missing or incorrect
|
| 12 |
+
|
| 13 |
+
@app.route('/process', methods=['POST'])
def process():
    """Download a PDF from Dropbox and return its extracted text as JSON.

    The request body must be a JSON object with a 'filePath' key; its value
    is passed as the ``path`` argument of the Dropbox ``files_download`` call.

    Returns:
        200 with ``{"message": ..., "input_data": <extracted text>}`` on
        success; 400 when the body is not a JSON object or has no
        'filePath' key; 500 with ``{"error": <exception text>}`` when
        anything else fails.
    """
    # check_api_key()
    try:
        print('In process')
        # silent=True makes get_json return None instead of raising on a
        # missing or malformed JSON body, so bad requests get the 400 below
        # rather than being reported as a server error by the except clause.
        data = request.get_json(silent=True)
        print(data)

        # The body must be a JSON object that includes 'filePath'.
        if not isinstance(data, dict) or 'filePath' not in data:
            return jsonify({"error": "Missing 'filePath' in request data"}), 400

        pdfpath = data['filePath']

        dbxTeam = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
        md, res = dbxTeam.files_download(path=pdfpath)
        pdf_data = res.content

        pdftext = pdftotext.texts_from_pdf(pdf_data)

        # Prepare response
        response_data = {
            "message": "Data received",
            "input_data": pdftext
        }
        return jsonify(response_data)

    except Exception as e:
        # Top-level boundary: report the failure to the client as a 500.
        print(f"Error: {e}")
        return jsonify({"error": str(e)}), 500
|
| 44 |
+
|
| 45 |
+
# Start Flask's built-in server only when this file is executed as a script,
# listening on all interfaces (0.0.0.0) on port 7860.
if __name__ == '__main__':
    app.run(host='0.0.0.0', port=7860)
|
pdftotext.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fitz
|
| 2 |
+
|
| 3 |
+
def texts_from_pdf(input_pdf_data):
    """Extract the plain text of every page of an in-memory PDF.

    Args:
        input_pdf_data: Raw PDF content, handed to PyMuPDF as the ``stream``.

    Returns:
        The text of all pages concatenated in page order, or an empty
        string when the document has no pages.
    """
    # Open from memory; the context manager closes the document on exit.
    with fitz.open(stream=input_pdf_data, filetype='pdf') as pdf_document:
        # Keep the text of every page instead of rebinding a single variable
        # on each iteration, which would leave only one page in the result.
        page_texts = [page.get_text() for page in pdf_document]

    text_instances = ''.join(page_texts)
    print(text_instances)
    return text_instances
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
datasets==2.*
|
| 2 |
+
flask==3.0.0.*
|
| 3 |
+
requests==2.27.*
|
| 4 |
+
sentencepiece==0.1.*
|
| 5 |
+
pymupdf==1.22.5
|
| 6 |
+
dropbox
|
| 7 |
+
numpy<2
|
tsadropboxretrieval.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# -*- coding: utf-8 -*-
|
| 2 |
+
"""TSADropboxRetrieval.ipynb
|
| 3 |
+
|
| 4 |
+
Automatically generated by Colaboratory.
|
| 5 |
+
|
| 6 |
+
Original file is located at
|
| 7 |
+
https://colab.research.google.com/drive/1d-UI3Y-z7Dj-vqu69CxluOUnN4rvsUuE
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
# !pip install dropbox -q
|
| 11 |
+
|
| 12 |
+
# pip install pymupdf #==1.22.5
|
| 13 |
+
|
| 14 |
+
import base64
|
| 15 |
+
import requests
|
| 16 |
+
import json
|
| 17 |
+
import pathlib
|
| 18 |
+
import pandas as pd
|
| 19 |
+
import dropbox
|
| 20 |
+
from dropbox.exceptions import AuthError
|
| 21 |
+
import fitz
|
| 22 |
+
import io
|
| 23 |
+
import re
|
| 24 |
+
import pyarrow
|
| 25 |
+
from io import BytesIO
|
| 26 |
+
|
| 27 |
+
"""### NEW CODE - OCTOBER 26 - Marthe"""
|
| 28 |
+
|
| 29 |
+
import os

# NOTE(review): this module-level list appears unused by the functions in this
# file (they take or create their own lists); kept so the name stays importable.
files_list = []

# Dropbox team-app credentials are read from the environment instead of being
# committed to the repository. Configure DROPBOX_APP_KEY, DROPBOX_APP_SECRET
# and DROPBOX_REFRESH_TOKEN in the deployment; DROPBOX_ACCESS_CODE is optional.
# Credentials that were previously committed should be revoked and re-issued.
# A missing variable becomes an empty string here, so the failure surfaces
# when a Dropbox client is built rather than at import time.
app_key = os.environ.get('DROPBOX_APP_KEY', '')
app_secret = os.environ.get('DROPBOX_APP_SECRET', '')
access_code = os.environ.get('DROPBOX_ACCESS_CODE', '')
refresh_token = os.environ.get('DROPBOX_REFRESH_TOKEN', '')
# "key:secret" encoded as base64 bytes.
basic_auth = base64.b64encode(f'{app_key}:{app_secret}'.encode())
|
| 36 |
+
|
| 37 |
+
def ADR_Access_DropboxTeam(flag):
    """Build a Dropbox team client acting as one team member.

    The client is created from the module-level ``app_key``, ``app_secret``
    and ``refresh_token`` and is rooted at the root namespace reported for
    the current account.

    Args:
        flag: 'user' to act through ``as_user`` or 'admin' to act through
            ``as_admin``.

    Returns:
        The member-scoped client with its path root applied.

    Raises:
        ValueError: If ``flag`` is neither 'user' nor 'admin'.
    """
    # Validate first so an unknown flag fails with a clear message instead of
    # an unbound local variable further down.
    if flag not in ('user', 'admin'):
        raise ValueError(f"flag must be 'user' or 'admin', got {flag!r}")

    member_id = 'dbmid:AACjJg2GKc3tI42iOnD01dd6s0XDyyx6Thw'
    team = dropbox.DropboxTeam(app_key=app_key,
                               app_secret=app_secret,
                               oauth2_refresh_token=refresh_token)
    if flag == 'user':
        dbxTeam = team.as_user(member_id)
    else:
        dbxTeam = team.as_admin(member_id)

    # Re-root the client at the account's root namespace.
    root_namespace_id = dbxTeam.users_get_current_account().root_info.root_namespace_id
    dbxTeam = dbxTeam.with_path_root(dropbox.common.PathRoot.root(root_namespace_id))
    return dbxTeam
|
| 49 |
+
|
| 50 |
+
def getSharedLink(path):
    """Return the URL of a shared link for the Dropbox item at ``path``.

    Tries ``sharing_create_shared_link_with_settings`` first and falls back
    to ``sharing_create_shared_link`` when the API rejects that call.

    Args:
        path: Dropbox path of the item to share.

    Returns:
        The ``url`` attribute of the shared-link metadata.
    """
    dbxTeam = ADR_Access_DropboxTeam('user')
    try:
        shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(path)
    except dropbox.exceptions.ApiError:
        # Only API-level rejections trigger the fallback; a bare except would
        # also swallow KeyboardInterrupt and SystemExit.
        shared_link_metadata = dbxTeam.sharing_create_shared_link(path)
    return shared_link_metadata.url
|
| 57 |
+
|
| 58 |
+
def handle_entries(entries , files_list):
    """Append one record per file entry to ``files_list`` and tabulate it.

    Args:
        entries: Iterable of Dropbox listing entries; only instances of
            ``dropbox.files.FileMetadata`` are recorded.
        files_list: List extended in place with one dict per file, holding
            'name', 'path_display', 'client_modified' and 'server_modified'.

    Returns:
        A DataFrame built from the whole of ``files_list``, including any
        records it already held before this call.
    """
    files_list.extend(
        {
            'name': entry.name,
            'path_display': entry.path_display,
            'client_modified': entry.client_modified,
            'server_modified': entry.server_modified,
        }
        for entry in entries
        if isinstance(entry, dropbox.files.FileMetadata)
    )
    return pd.DataFrame.from_records(files_list)
|
| 73 |
+
|
| 74 |
+
def dropbox_connect():
    """Create a (non-team) ``dropbox.Dropbox`` client.

    Credentials are read from the environment variables
    DROPBOX_PERSONAL_APP_KEY, DROPBOX_PERSONAL_APP_SECRET and
    DROPBOX_PERSONAL_REFRESH_TOKEN rather than being embedded in source.
    Credentials that were previously committed should be revoked and
    re-issued.

    Returns:
        The constructed ``dropbox.Dropbox`` client.

    Raises:
        AuthError: Re-raised after being logged if the client cannot
            authenticate.
    """
    import os  # local import so this function does not depend on file-level changes

    print('connecy')
    try:
        dbxMe = dropbox.Dropbox(
            app_key=os.environ.get('DROPBOX_PERSONAL_APP_KEY', ''),
            app_secret=os.environ.get('DROPBOX_PERSONAL_APP_SECRET', ''),
            oauth2_refresh_token=os.environ.get('DROPBOX_PERSONAL_REFRESH_TOKEN', ''),
        )
    except AuthError as e:
        print('Error connecting to Dropbox with access token: ' + str(e))
        # Propagate the real error; falling through to the return would fail
        # with an unrelated unbound-variable error instead.
        raise
    return dbxMe
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def dropbox_upload_file(df, flag=0):
    """Serialise ``df`` to Parquet and store it at a fixed Dropbox path.

    Failures are printed rather than raised (best effort).

    Args:
        df: DataFrame to serialise with ``to_parquet()``.
        flag: Unused; kept so existing callers keep working.

    Returns:
        The admin team client used for the upload, or ``None`` when the
        client could not be created.
    """
    dbxTeam = None  # stays None if creating the client fails
    try:
        dbxTeam = ADR_Access_DropboxTeam('admin')
        path = '/TSA JOBS/ADR Test/DropboxDirectory/df.parquet.gzip'
        doc = df.to_parquet()
        # Overwrite in a single call. Deleting first raises when the file
        # does not exist yet, which would skip the upload entirely.
        dbxTeam.files_upload(doc, path, mode=dropbox.files.WriteMode.overwrite)
    except Exception as e:
        print('Error uploading file to Dropbox: ' + str(e))
    return dbxTeam
|
| 103 |
+
|
| 104 |
+
def check_if_file_exists(dbxTeam,path):
    """Report whether metadata can be fetched for ``path``.

    Args:
        dbxTeam: Client object exposing ``files_get_metadata``.
        path: Path handed to ``files_get_metadata``.

    Returns:
        True when the metadata call succeeds, False when it raises any
        ``Exception``.
    """
    try:
        dbxTeam.files_get_metadata(path)
    except Exception:
        return False
    return True
|
| 112 |
+
|
| 113 |
+
def uploadanyFile(doc,pdfname,path,flag=0):
    """Upload a document to Dropbox and return a shared link to it.

    Args:
        doc: With a truthy ``flag``, an object whose
            ``write(file, encoding=..., xml_declaration=...)`` serialises it
            into a file object (presumably an XML element tree; confirm with
            callers). Otherwise an object whose no-argument ``write()``
            returns the upload payload (presumably a PyMuPDF document;
            confirm with callers).
        pdfname: File name appended to ``path``. With a truthy ``flag`` the
            text from the first '.pdf' onward is replaced by '.xml'.
        path: Dropbox folder prefix. It is concatenated directly with the
            file name, so it must already end with a separator.
        flag: Truthy to upload ``doc`` as XML, falsy to upload
            ``doc.write()`` as is.

    Returns:
        The shared-link URL on success, or the string
        'Error uploading file to Dropbox.' when anything fails.
    """
    try:
        dbxTeam = ADR_Access_DropboxTeam('admin')
        print('ppp')
        if flag:  # tree = doc
            pdfname = str(pdfname).split('.pdf')[0] + '.xml'
            path = path + pdfname
            print(path)
            f = BytesIO()
            doc.write(f, encoding='utf-8', xml_declaration=True)
            payload = f.getvalue()
        else:
            path = path + pdfname
            payload = doc.write()

        # The payload is fully built before the old file is removed, so a
        # serialisation failure cannot destroy the existing copy.
        if check_if_file_exists(dbxTeam, path):
            dbxTeam.files_delete(path)
        dbxTeam.files_upload(payload, path)

        try:
            print('hereintry')
            shared_link_metadata = dbxTeam.sharing_create_shared_link_with_settings(path)
        except dropbox.exceptions.ApiError:
            # Fall back only on API-level rejections; other errors are
            # reported by the outer handler.
            print('hereinexcept')
            shared_link_metadata = dbxTeam.sharing_create_shared_link(path)
        return shared_link_metadata.url
    except Exception as e:
        print('Error uploading file to Dropbox: ' + str(e))

        return 'Error uploading file to Dropbox.'
|
| 144 |
+
|
| 145 |
+
# Call this when Dropbox has been updated with new items; otherwise load the saved Parquet copy of the items DataFrame instead (see GetParquetDF).
|
| 147 |
+
def DropboxItemstoDF(folder_path):
    """List every file under ``folder_path`` recursively and tabulate them.

    Args:
        folder_path: Dropbox folder to list, e.g. "/TSA JOBS".

    Returns:
        A tuple ``(df, files_list)``: the DataFrame built by
        ``handle_entries`` from all pages of the listing, and the list of
        per-file records it was built from.
    """
    files_list = []
    dbxTeam = ADR_Access_DropboxTeam('user')  # or pass dbx in parameters
    res = dbxTeam.files_list_folder(path=folder_path, recursive=True)
    # Record the first page as well. Handling only the continuation pages
    # drops its entries and leaves the result undefined when the listing
    # fits in a single page.
    df2 = handle_entries(res.entries, files_list)
    while res.has_more:
        res = dbxTeam.files_list_folder_continue(cursor=res.cursor)
        df2 = handle_entries(res.entries, files_list)

    # dbxTeam=dropbox_upload_file(df2)
    return df2, files_list
|
| 161 |
+
|
| 162 |
+
|
| 163 |
+
def GetParquetDF():
    """Load the saved items DataFrame from its Parquet file on Dropbox.

    The file is fetched through a shared link: a link is created with
    ``sharing_create_shared_link_with_settings``, or with
    ``sharing_create_shared_link`` when that call raises ``ApiError``, and
    the content behind the link is then downloaded.

    Returns:
        The DataFrame read from the downloaded Parquet bytes.
    """
    parquet_path = '/TSA JOBS/ADR Test/DropboxDirectory/df.parquet.gzip'
    client = ADR_Access_DropboxTeam('user')  # or pass dbx in parameters

    try:
        link_metadata = client.sharing_create_shared_link_with_settings(path=parquet_path)
    except dropbox.exceptions.ApiError:
        # Fall back to the call that takes no settings.
        link_metadata = client.sharing_create_shared_link(path=parquet_path)

    _, response = client.sharing_get_shared_link_file(url=link_metadata.url)

    # Parse the downloaded bytes from an in-memory buffer.
    with io.BytesIO(response.content) as buffer:
        return pd.read_parquet(buffer)
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
def getPathtoPDF_File(nameofPDF):
    """Look up a file by name in the saved index and return its path and link.

    Args:
        nameofPDF: File name to search for; double quotes are stripped
            before matching against the 'name' column.

    Returns:
        A tuple ``(path, link)`` for the first matching row, or the string
        'Project does not exist' when the lookup or link creation fails.
    """
    parquetDf = GetParquetDF()
    nameofPDF = nameofPDF.replace('"', '')
    try:
        path = parquetDf.loc[parquetDf['name'] == nameofPDF, 'path_display'].iloc[0]
        link = getSharedLink(path)
        print(path, link)
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not reported as a missing project.
        return 'Project does not exist'
    return path, link
|
| 194 |
+
# parquetDf
|
| 195 |
+
|
| 196 |
+
# getPathtoPDF_File('A5157-EBLA-V5-XX-SH-L-0004-D2-01.pdf')
|
| 197 |
+
def getPDFData(path):
    """Download the file at ``path`` with the admin client and return its content.

    Args:
        path: Dropbox path passed to ``files_download``.

    Returns:
        The ``content`` attribute of the download response.
    """
    _, response = ADR_Access_DropboxTeam('admin').files_download(path)
    return response.content
|
| 202 |
+
|
| 203 |
+
def retrieveProjects(projname):
    """Find indexed documents whose path mentions every word of a project name.

    The search phrase is ``projname`` followed by ' 01 Project Details'. It
    is lower-cased and split on punctuation and spaces, and a document is
    relevant when each resulting token occurs in its lower-cased
    'path_display'.

    Args:
        projname: Project name to search for.

    Returns:
        A tuple ``(documentsToMeasure, RelevantDocuments)``. Both are lists
        of ``[name, path_display]`` pairs; the first holds only the relevant
        documents whose name ends with '.pdf'.
    """
    print('retrieve')

    catalogue = GetParquetDF()

    # Tokenise the lower-cased search phrase.
    search_phrase = f'{projname} 01 Project Details'.lower()
    tokens = set(re.split(r'[`\-= ~!@#$%^&*()_+\[\]{};\'\\:"|<,/<>?]', search_phrase))

    # Case-insensitive matching: compare against lower-cased paths.
    lowered_paths = catalogue['path_display'].str.lower()
    is_relevant = lowered_paths.apply(lambda p: all(tok in p for tok in tokens))

    RelevantDocuments = catalogue[is_relevant][['name', 'path_display']].values.tolist()
    documentsToMeasure = [doc for doc in RelevantDocuments if doc[0].endswith('.pdf')]

    print('done')
    return documentsToMeasure, RelevantDocuments
|
| 227 |
+
|
| 228 |
+
|