Marthee committed on
Commit
44130c7
·
verified ·
1 Parent(s): 00ecae5

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +450 -0
app.py ADDED
@@ -0,0 +1,450 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, request, jsonify, render_template, send_file, redirect, url_for, Response
2
+ import tsadropboxretrieval
3
+ # import findInitialMarkups
4
+ import InitialMarkups
5
+ import requests
6
+ import fitz
7
+ from io import BytesIO
8
+ import datetime
9
+ import time
10
+ from threading import Thread
11
+ from urllib.parse import quote, unquote, parse_qs
12
+ # import pdftotext
13
+ import json
14
+ # -------------------- App & Globals --------------------
15
# The Flask application object; every route in this file registers on it.
app = Flask(__name__)
# Only read when composing the download_name of served PDFs; it is never
# reassigned in the code visible here, so that name always ends in "_0".
pageNumTextFound = 0
BASE_URL = "https://adr.trevorsadd.co.uk/api/testpage"  ##changed this only
# NOTE(review): not referenced by any handler visible in this file.
backend_ready = False
# Written by /findapi and /findapiFilteredHeadings, read by /view-highlight.
jsonoutput = []  # ensure defined before use
20
+
21
+ # -------------------- Simple Health/Test --------------------
22
@app.route("/health", methods=["GET"])
def health():
    """Report liveness as JSON: ``status`` plus the server time in ISO format."""
    # Import locally: further down this module `from datetime import datetime`
    # rebinds the global name `datetime` to the class, so the previous
    # `datetime.datetime.now()` raised AttributeError on every request.
    from datetime import datetime as _datetime

    return jsonify(status="ok", time=_datetime.now().isoformat())
25
+
26
+ # -------------------- Root: keep it simple & reliable --------------------
27
@app.route("/", methods=["GET"])
def root():
    """Return a fixed JSON greeting for the service root with HTTP 200.

    No template is rendered, so the endpoint cannot fail on a missing template.
    """
    greeting = "FIND APIs root. Use /health or /testpage."
    return jsonify(message=greeting), 200
31
+
32
+ # -------------------- Headers Filtering Find 1 Space --------------------
33
@app.route('/api/process-data', methods=['POST'])
def process_headers():
    """Return the headers extracted from the PDF named by ``filePath``.

    Expects a JSON body containing ``filePath``. Responds 400 when it is
    missing and 500 (with the error text) when extraction fails.
    """
    try:
        data = request.get_json(force=True) or {}
        filePath = data.get('filePath')
        if not filePath:
            return jsonify({"error": "Missing 'filePath'"}), 400

        # NOTE(review): the module-level import of findInitialMarkups is
        # commented out, so the bare name was undefined and every valid
        # request ended in a NameError. Importing lazily restores the call
        # when the module is available and otherwise reports a clear
        # ImportError through the 500 path below. Confirm the module is
        # still meant to be deployed.
        import findInitialMarkups

        headers = findInitialMarkups.headersfrompdf(filePath)
        return jsonify(headers)
    except Exception as e:
        print(f"Error in /api/process-data: {e}")
        return jsonify({"error": str(e)}), 500
45
+
46
+ # -------------------- PDF to Text 1 Space --------------------
47
@app.route('/processalltext1', methods=['POST'])
def processalltextTotext():
    """Return the text extracted from the PDF named by ``filePath``.

    Expects a JSON body containing ``filePath``. On success the JSON reply
    carries ``message``, ``input_data`` (the extracted text) and
    ``Filename:``. Responds 400 when ``filePath`` is missing and 500 (with
    the error text) when extraction fails.
    """
    try:
        data = request.get_json(force=True) or {}
        pdfpath = data.get('filePath')
        if not pdfpath:
            return jsonify({"error": "Missing 'filePath' in request data"}), 400

        # NOTE(review): the module-level import of pdftotext is commented
        # out, so the bare name was undefined and every valid request ended
        # in a NameError. Importing lazily restores the call when the module
        # is available and otherwise reports a clear ImportError through the
        # 500 path below. Confirm the module is still meant to be deployed.
        import pdftotext

        pdftext, filename = pdftotext.texts_from_pdfAllText(pdfpath)
        # The "Filename:" key (with its trailing colon) is part of the
        # existing response contract and is kept as-is.
        return jsonify({"message": "Data received", "input_data": pdftext, "Filename:": filename})
    except Exception as e:
        print(f"Error in /processalltext1: {e}")
        return jsonify({"error": str(e)}), 500
59
+
60
+ # -------------------- Keepalive --------------------
61
@app.route("/keepaliveapii", methods=["GET", "POST"])
def keepaliveapi():
    """Answer a keep-alive ping with the plain-text body ``alivee``.

    If logging the ping fails, a JSON error payload is returned with HTTP 500.
    """
    try:
        print('Keepalive pinged')
    except Exception as error:
        print('Error in keepalive:', error)
        return jsonify(status="error", message=str(error)), 500
    else:
        return 'alivee'
69
+
70
+ # -------------------- View PDF (Marked up) --------------------
71
def getpdfcontent(pdf_path, timeout=30):
    """Download a PDF and return its bytes wrapped in a ``BytesIO`` stream.

    Args:
        pdf_path: URL of the PDF. For links containing ``http`` or
            ``dropbox``, ``dl=0`` is rewritten to ``dl=1``.
        timeout: Seconds to wait for the remote server before giving up.

    Returns:
        A ``BytesIO`` positioned at the start of the downloaded PDF.

    Raises:
        ValueError: if ``pdf_path`` is empty or the payload does not start
            with the ``%PDF`` signature.
        requests.RequestException: on network failure, timeout, or a
            non-success HTTP status.
    """
    if not pdf_path:
        # Fail with a clear message instead of an opaque error from requests.
        raise ValueError("Missing PDF link.")

    # Handle Dropbox URLs
    if 'http' in pdf_path or 'dropbox' in pdf_path:
        pdf_path = pdf_path.replace('dl=0', 'dl=1')

    # Without a timeout a stalled remote host would block this worker forever.
    response = requests.get(pdf_path, timeout=timeout)
    # Surface HTTP errors directly rather than as a misleading "not a PDF".
    response.raise_for_status()
    pdf_bytes = response.content

    if not pdf_bytes or not pdf_bytes.startswith(b"%PDF"):
        raise ValueError("No valid PDF content found.")

    return BytesIO(pdf_bytes)
85
+
86
+
87
@app.route('/view-pdf', methods=['GET'])
def view_pdf():
    """Stream the PDF referenced by the ``pdfLink`` query parameter inline.

    Responds 400 when the parameter is missing, 500 when the download fails
    and 404 when no content came back.
    """
    raw_link = request.args.get('pdfLink')
    if not raw_link:
        return "Missing pdfLink parameter.", 400

    pdf_link = unquote(raw_link)
    print("Extracted PDF Link:", pdf_link)

    try:
        pdf_stream = getpdfcontent(pdf_link)
    except Exception as e:
        print("Error during PDF extraction:", e)
        return "PDF could not be processed.", 500

    if pdf_stream is None:
        return "PDF content not found or broken.", 404

    # getpdfcontent already hands back a file-like object, so it is passed
    # to send_file directly.
    send_options = {
        "mimetype": 'application/pdf',
        "as_attachment": False,
        "download_name": "annotated_page.pdf",
    }
    return send_file(pdf_stream, **send_options)
112
+
113
+ # -------------------- Process PDF -> Upload to Dropbox (renamed to avoid duplicate route) --------------------
114
@app.route('/api/process-pdf', methods=['POST'])
def process_pdf_and_upload():
    """Mark up the PDF at ``filePath`` and upload both results to Dropbox.

    The marked-up document and its markup-summary document are uploaded to
    the FIND folder; the JSON reply carries the two values returned by the
    upload helper. Responds 400 when ``filePath`` is missing and 500 (with
    the error text) on any failure.
    """
    try:
        payload = request.get_json(force=True) or {}
        source_link = payload.get('filePath')
        if not source_link:
            return jsonify({"error": "'filePath' must be provided."}), 400

        print("Processing PDF:", source_link)
        _, marked_doc, summary_doc = InitialMarkups.extract_section_under_header(source_link)

        team_client = tsadropboxretrieval.ADR_Access_DropboxTeam('user')
        link_meta = team_client.sharing_get_shared_link_metadata(source_link)

        target_folder = '/TSA JOBS/ADR Test/FIND/'
        marked_link = tsadropboxretrieval.uploadanyFile(
            doc=marked_doc,
            path=target_folder,
            pdfname=link_meta.name,
        )
        # The summary reuses the source file name with ".pdf" stripped from
        # its last occurrence and a fixed suffix appended.
        summary_name = link_meta.name.rsplit(".pdf", 1)[0] + ' Markup Summary.pdf'
        summary_link = tsadropboxretrieval.uploadanyFile(
            doc=summary_doc,
            path=target_folder,
            pdfname=summary_name,
        )
        print('Uploaded:', marked_link, summary_link)

        reply = {
            "message": "PDF processed successfully.",
            "PDF_MarkedUp": marked_link,
            "Table_PDF_Markup_Summary": summary_link,
        }
        return jsonify(reply)
    except Exception as e:
        print(f"Error in /api/process-pdf: {e}")
        return jsonify({"error": str(e)}), 500
145
+
146
+ # -------------------- Not billed / Markup subsets --------------------
147
@app.route('/findapitobebilled1', methods=['GET','POST'])
def findapitobebilled1():
    """Return the "to be billed" text of the PDF at ``filePath`` as JSON.

    The reply is the fourth value of the six returned by
    ``InitialMarkups.extract_section_under_header_tobebilledOnly``.
    Responds 400 when ``filePath`` is missing and 500 on any failure.
    """
    try:
        payload = request.get_json(force=True) or {}
        source_link = payload.get('filePath')
        if not source_link:
            return jsonify({"error": "Missing 'filePath'"}), 400

        extraction = InitialMarkups.extract_section_under_header_tobebilledOnly(source_link)
        # Strict six-way unpack: a changed return shape fails loudly here.
        _, _, _, billed_text, _, _ = extraction
        return jsonify(billed_text)
    except Exception as e:
        print(f"Error in /findapitobebilled1: {e}")
        return jsonify({"error": str(e)}), 500
160
+
161
+
162
+ # ----------------------------------------------------------------------
163
@app.route('/findapitobebilled_htmlformat', methods=['GET','POST'])
def findapitobebilled_htmlformat():
    """Render the "to be billed" markup rows of a PDF as one HTML document.

    Expects a JSON body containing ``filePath``. Each row contributes, when
    present, ``head above 2`` as <h1>, ``head above 1`` as <h2>, ``Subject``
    as <h3> and the space-joined ``BodyText`` as <p>. The JSON reply holds
    the HTML under ``input_data`` and the file name under ``Filename:``.
    Responds 400 when ``filePath`` is missing and 500 on any failure.
    """
    from html import escape

    try:
        data = request.get_json(force=True) or {}
        pdfLink = data.get('filePath')
        if not pdfLink:
            return jsonify({"error": "Missing 'filePath'"}), 400

        (_pdfbytes, _pdf_document, tablepdfoutput,
         _alltext_tobebilled, _alltext_no_notbilled, filename) = (
            InitialMarkups.extract_section_under_header_tobebilledOnly(pdfLink)
        )

        # Accept either JSON text or already-parsed rows, matching the
        # multiple-PDF handler in this file.
        if isinstance(tablepdfoutput, str):
            sections = json.loads(tablepdfoutput)
        else:
            sections = tablepdfoutput

        heading_tags = (("head above 2", "h1"), ("head above 1", "h2"), ("Subject", "h3"))
        parts = []
        for section in sections:
            for key, tag in heading_tags:
                if key in section:
                    # Text comes from the PDF, so escape it: a stray "<" or
                    # "&" must not break or inject markup.
                    parts.append(f"<{tag}>{escape(str(section[key]))}</{tag}><br>")
            if "BodyText" in section:
                parts.append(f"<p>{escape(' '.join(section['BodyText']))}</p><br>")
        html_body = "".join(parts)

        # Wrap everything into one HTML document
        html_content = f"""
<!DOCTYPE html>
<html>
<head>
<title>{escape(str(filename))}</title>
<meta charset="utf-8">
</head>
<body>
{html_body}
</body>
</html>
"""
        return jsonify({"input_data": html_content, "Filename:": filename})
    except Exception as e:
        print(f"Error in /findapitobebilled_htmlformat: {e}")
        return jsonify({"error": str(e)}), 500
208
+
209
+
210
@app.route('/view-pdf-tobebilled', methods=['GET'])
def view_pdf_tobebilled():
    """Serve inline the "to be billed" marked-up PDF for ``pdfLink``.

    Responds 400 when the parameter is missing, 500 when extraction raises
    and 404 when the extracted bytes are absent or lack the %PDF signature.
    """
    raw_link = request.args.get('pdfLink')
    if not raw_link:
        return "Missing pdfLink parameter.", 400

    pdf_link = unquote(raw_link)
    print("Extracted PDF Link:", pdf_link)

    try:
        extraction = InitialMarkups.extract_section_under_header_tobebilledOnly(pdf_link)
        pdf_bytes = extraction[0]
    except Exception as e:
        print("Error during PDF extraction:", e)
        return "PDF could not be processed.", 500

    looks_like_pdf = pdf_bytes is not None and pdf_bytes.startswith(b"%PDF")
    if not looks_like_pdf:
        return "PDF content not found or broken.", 404

    attachment_name = f"annotated_page_{pageNumTextFound}.pdf"
    return send_file(
        BytesIO(pdf_bytes),
        mimetype='application/pdf',
        as_attachment=False,
        download_name=attachment_name,
    )
230
+
231
+ # -------------------- Final markups: view one highlight --------------------
232
@app.route('/view-highlight', methods=['GET','POST'])
def download_pdfHighlight():
    """Serve inline a PDF annotated for a single ``keyword`` of ``pdfLink``.

    The page and the ``head above 1`` text for the keyword are looked up in
    the module-level ``jsonoutput`` cache (filled by /findapi and
    /findapiFilteredHeadings) by matching ``Subject``. Without a match,
    page 0 and no heading text are used.

    Responds 400 when a parameter is missing, 500 when the lookup or the
    extraction raises and 404 when no PDF bytes come back.
    """
    pdf_link = request.args.get('pdfLink')
    keyword = request.args.get('keyword')
    if not pdf_link or not keyword:
        return "Missing required parameters.", 400

    pdf_link = unquote(pdf_link)
    print("Extracted PDF Link:", pdf_link)
    print("Extracted Keyword:", keyword)

    # Guard the lookup and extraction the same way /view-pdf-tobebilled does,
    # so a bad cache entry or extractor failure gives a clean 500.
    try:
        # Sibling handlers in this file treat the extractor's table output
        # as either JSON text or parsed rows, so the cache may hold either.
        cached_rows = json.loads(jsonoutput) if isinstance(jsonoutput, str) else jsonoutput
        matching_item = next(
            (
                item
                for item in (cached_rows or [])
                if isinstance(item, dict) and item.get("Subject") == keyword
            ),
            None,
        )

        if matching_item:
            # The stored page is treated as 1-based; the extractor receives
            # it minus one.
            page_number = int(matching_item.get("Page")) - 1
            stringtowrite = matching_item.get("head above 1")
            print(f"Page number for '{keyword}': {page_number}")
        else:
            page_number = 0
            stringtowrite = None
            print("No match found in jsonoutput; defaulting to page 0.")

        # NOTE(review): /findapiFilteredHeadings calls this same extractor
        # with two arguments and unpacks four results — confirm both call
        # shapes are supported.
        pdf_content = InitialMarkups.extract_section_under_headerRawan(
            pdf_link, keyword, page_number, stringtowrite
        )[0]
    except Exception as e:
        print("Error during PDF extraction:", e)
        return "PDF could not be processed.", 500

    if pdf_content is None:
        return "PDF content not found.", 404

    return send_file(
        BytesIO(pdf_content),
        mimetype='application/pdf',
        as_attachment=False,
        download_name=f"annotated_page_{pageNumTextFound}.pdf"
    )
265
+
266
@app.route('/findapiFilteredHeadings', methods=['GET','POST'])
def findapiFilteredHeadings():
    """Extract the sections under the requested headings and return their text.

    Expects a JSON body with ``filePath`` and ``listofheadings``. The
    extractor's table output is stored in the module-level ``jsonoutput``
    for later use by /view-highlight. Responds 400 when an input is missing
    and 500 on any failure.
    """
    global jsonoutput
    try:
        payload = request.get_json(force=True) or {}
        source_link = payload.get('filePath')
        headings = payload.get('listofheadings')  # json array
        if not source_link or headings is None:
            return jsonify({"error": "Missing 'filePath' or 'listofheadings'"}), 400

        extraction = InitialMarkups.extract_section_under_headerRawan(source_link, headings)
        # Strict four-way unpack: a changed return shape fails loudly here.
        _, _, table_rows, all_text = extraction
        jsonoutput = table_rows
        return jsonify(all_text)
    except Exception as e:
        print(f"Error in /findapiFilteredHeadings: {e}")
        return jsonify({"error": str(e)}), 500
282
+
283
@app.route('/findapitobebilledonlyNew', methods=['GET','POST'])
def findapitobebilledonly():
    """Render the markup rows of a PDF as one HTML document.

    Expects a JSON body containing ``filePath``. Rows come from
    ``InitialMarkups.extract_section_under_header_tobebilled2``. Each row
    contributes, when present, ``head above 2`` as <h1>, ``head above 1``
    as <h2>, ``Subject`` as <h3> and the space-joined ``BodyText`` as <p>.
    The JSON reply holds the HTML under ``input_data`` and the file name
    under ``Filename:``. Responds 400 when ``filePath`` is missing and 500
    on any failure.
    """
    from html import escape

    try:
        data = request.get_json(force=True) or {}
        pdfLink = data.get('filePath')
        if not pdfLink:
            return jsonify({"error": "Missing 'filePath'"}), 400

        _pdfbytes, _pdf_document, tablepdfoutput, _alltext, filename = (
            InitialMarkups.extract_section_under_header_tobebilled2(pdfLink)
        )

        # Accept either JSON text or already-parsed rows, matching the
        # multiple-PDF handler in this file.
        if isinstance(tablepdfoutput, str):
            sections = json.loads(tablepdfoutput)
        else:
            sections = tablepdfoutput

        heading_tags = (("head above 2", "h1"), ("head above 1", "h2"), ("Subject", "h3"))
        parts = []
        for section in sections:
            for key, tag in heading_tags:
                if key in section:
                    # Text comes from the PDF, so escape it: a stray "<" or
                    # "&" must not break or inject markup.
                    parts.append(f"<{tag}>{escape(str(section[key]))}</{tag}><br>")
            if "BodyText" in section:
                parts.append(f"<p>{escape(' '.join(section['BodyText']))}</p><br>")
        html_body = "".join(parts)

        # Wrap everything into one HTML document
        html_content = f"""
<!DOCTYPE html>
<html>
<head>
<title>{escape(str(filename))}</title>
<meta charset="utf-8">
</head>
<body>
{html_body}
</body>
</html>
"""
        return jsonify({"input_data": html_content, "Filename:": filename})
    except Exception as e:
        # Log under the actual route so failures can be traced to this handler.
        print(f"Error in /findapitobebilledonlyNew: {e}")
        return jsonify({"error": str(e)}), 500
330
+
331
+
332
+
333
@app.route('/findapitobebilledonlyNewMultiplePDFS', methods=['GET','POST'])
def findapitobebilledonlymarthe():
    """Render the markup rows for ``filePath`` as one HTML document.

    Expects a JSON body containing ``filePath``. Rows come from
    ``InitialMarkups.extract_section_under_header_tobebilledMultiplePDFS``
    and may arrive as JSON text or as already-parsed rows. Each row
    contributes, when present, ``head above 2`` as <h1>, ``head above 1``
    as <h2>, ``Subject`` as <h3> and the space-joined ``BodyText`` as <p>.
    The JSON reply holds the HTML under ``input_data`` and the file name
    under ``Filename:``. Responds 400 when ``filePath`` is missing and 500
    on any failure.
    """
    from html import escape

    try:
        data = request.get_json(force=True) or {}
        pdfLink = data.get('filePath')
        if not pdfLink:
            return jsonify({"error": "Missing 'filePath'"}), 400

        _pdfbytes, _pdf_document, tablepdfoutput, _alltext, filename = (
            InitialMarkups.extract_section_under_header_tobebilledMultiplePDFS(pdfLink)
        )

        # Parse JSON text into rows; pass already-parsed rows through.
        if isinstance(tablepdfoutput, str):
            sections = json.loads(tablepdfoutput)
        else:
            sections = tablepdfoutput

        heading_tags = (("head above 2", "h1"), ("head above 1", "h2"), ("Subject", "h3"))
        parts = []
        for section in sections:
            for key, tag in heading_tags:
                if key in section:
                    # Text comes from the PDF, so escape it: a stray "<" or
                    # "&" must not break or inject markup.
                    parts.append(f"<{tag}>{escape(str(section[key]))}</{tag}><br>")
            if "BodyText" in section:
                parts.append(f"<p>{escape(' '.join(section['BodyText']))}</p><br>")
        html_body = "".join(parts)

        # Wrap everything into one HTML document
        html_content = f"""
<!DOCTYPE html>
<html>
<head>
<title>{escape(str(filename))}</title>
<meta charset="utf-8">
</head>
<body>
{html_body}
</body>
</html>
"""
        return jsonify({"input_data": html_content, "Filename:": filename})
    except Exception as e:
        # The old message named a different route (copy-paste), which made
        # failures here look like they came from another handler.
        print(f"Error in /findapitobebilledonlyNewMultiplePDFS: {e}")
        return jsonify({"error": str(e)}), 500
382
+
383
+
384
@app.route('/findapiAllDocNoNotbilled', methods=['GET','POST'])
def findapiAllDocNoNotbilled():
    """Return the ``alltextNoNotbilled`` text of the PDF at ``filePath`` as JSON.

    The reply is the fifth value of the six returned by
    ``InitialMarkups.extract_section_under_header_tobebilledOnly``.
    Responds 400 when ``filePath`` is missing and 500 on any failure.
    """
    try:
        payload = request.get_json(force=True) or {}
        source_link = payload.get('filePath')
        if not source_link:
            return jsonify({"error": "Missing 'filePath'"}), 400

        extraction = InitialMarkups.extract_section_under_header_tobebilledOnly(source_link)
        # Strict six-way unpack: a changed return shape fails loudly here.
        _, _, _, _, text_without_not_billed, _ = extraction
        return jsonify(text_without_not_billed)
    except Exception as e:
        print(f"Error in /findapiAllDocNoNotbilled: {e}")
        return jsonify({"error": str(e)}), 500
396
+
397
+ # -------------------- Rawan - MC Connection --------------------
398
@app.route('/findapi', methods=['GET','POST'])
def findapi():
    """Extract the markup table for the PDF at ``filePath`` and return it as JSON.

    The table is also stored in the module-level ``jsonoutput`` so that
    /view-highlight can look up pages later. Responds 400 when ``filePath``
    is missing and 500 on any failure.
    """
    global jsonoutput
    try:
        payload = request.get_json(force=True) or {}
        source_link = payload.get('filePath')
        if not source_link:
            return jsonify({"error": "Missing 'filePath'"}), 400

        extraction = InitialMarkups.extract_section_under_header(source_link)
        # Strict three-way unpack: a changed return shape fails loudly here.
        _, _, table_rows = extraction
        jsonoutput = table_rows
        return jsonify(table_rows)
    except Exception as e:
        print(f"Error in /findapi: {e}")
        return jsonify({"error": str(e)}), 500
413
+
414
+ #--------------------testpage-----------------------------
415
+ import socket
416
+ from datetime import datetime
417
+
418
@app.route('/testpage')
def test_page():
    """Return a static HTML status page showing the hostname and server time."""
    # Gather the two dynamic values shown on the page.
    host_label = socket.gethostname()
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    page = f"""
<!DOCTYPE html>
<html>
<head>
<title>Server Test Page</title>
<style>
body {{ font-family: Arial, sans-serif; text-align: center; margin-top: 50px; }}
.success {{ color: #2ecc71; font-size: 24px; }}
.info {{ color: #34495e; margin-top: 10px; }}
.container {{ max-width: 600px; margin: 0 auto; text-align: left; }}
</style>
</head>
<body>
<div class="success">🚀 Flask Server is Running!</div>
<div class="container">
<p class="info"><strong>Hostname:</strong> {host_label}</p>
<p class="info"><strong>Server Time:</strong> {timestamp}</p>
<p class="info"><strong>Endpoint:</strong> /testpage</p>
<p class="info"><strong>Status:</strong> <span style="color: #2ecc71;">Operational ✅</span></p>
</div>
</body>
</html>
"""
    return page
447
+
448
+ # -------------------- Run --------------------
449
if __name__ == "__main__":
    import os

    # debug=True on a listener bound to all interfaces exposes Werkzeug's
    # interactive debugger, which allows code execution, to anyone who can
    # reach the port. Debug mode is now opt-in via FLASK_DEBUG.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(host="0.0.0.0", port=5000, debug=debug_enabled)